**PRE MERGE QUALITY CHECKS**


In [None]:
import pandas as pd

def check_missing_values(df):
    """Count nulls per column and keep only the columns that contain at least one."""
    null_counts = df.isnull().sum()
    has_nulls = null_counts > 0
    return null_counts[has_nulls]

def check_data_types(df):
    """Give back a Series mapping every column name to its dtype."""
    dtype_by_column = df.dtypes
    return dtype_by_column

def check_duplicates(df):
    """Count the rows that repeat an earlier row in full."""
    repeated_rows = df.duplicated()
    return repeated_rows.sum()


In [None]:
def handle_missing_values(df, strategy='drop', fill_value=None):
    """
    Resolve missing values in a DataFrame and return the result.

    - strategy: 'drop' discards every row containing a missing value;
      'fill' replaces missing values with ``fill_value``.
    - fill_value: replacement used by the 'fill' strategy; must not be None.

    Raises ValueError for any other strategy, or when 'fill' is requested
    without a fill_value.
    """
    if strategy == 'drop':
        return df.dropna()

    # Anything other than a fully specified 'fill' request is rejected.
    if strategy != 'fill' or fill_value is None:
        raise ValueError("Invalid strategy or fill_value not provided.")

    return df.fillna(fill_value)

def convert_data_types(df, conversions):
    """
    Converts columns to specified data types and returns a new DataFrame.

    - df: input DataFrame; it is left unmodified.
    - conversions: Dictionary with column names as keys and target data types as values.

    Columns whose values cannot be converted keep their original dtype
    (``errors='ignore'``). A column name missing from ``df`` raises KeyError.
    """
    # Work on a copy so the caller's frame is not silently mutated.
    converted = df.copy()
    for col, dtype in conversions.items():
        converted[col] = converted[col].astype(dtype, errors='ignore')
    return converted

def remove_duplicates(df):
    """Return the DataFrame with fully repeated rows dropped (first occurrence kept)."""
    deduplicated = df.drop_duplicates()
    return deduplicated


In [None]:
# List of datasets with their respective file paths
datasets = {
    'Orders': '/kaggle/input/brazilian-ecommerce/olist_orders_dataset.csv',
    'Order Items': '/kaggle/input/brazilian-ecommerce/olist_order_items_dataset.csv',
    'Order Payments': '/kaggle/input/brazilian-ecommerce/olist_order_payments_dataset.csv',
    'Order Reviews': '/kaggle/input/brazilian-ecommerce/olist_order_reviews_dataset.csv',
    'Products': '/kaggle/input/brazilian-ecommerce/olist_products_dataset.csv',
    'Sellers': '/kaggle/input/brazilian-ecommerce/olist_sellers_dataset.csv',
    'Category Translation': '/kaggle/input/brazilian-ecommerce/product_category_name_translation.csv'
}

# Dictionary to store cleaned DataFrames
cleaned_dataframes = {}


def _report_quality(df):
    """Print the shape, missing-value counts, dtypes and duplicate count of ``df``.

    Returns a ``(missing_values, duplicate_count)`` tuple so the caller can
    decide which cleaning steps to apply.
    """
    rows, columns = df.shape
    print(f"The dataset contains {rows} rows and {columns} columns.")

    missing_values = check_missing_values(df)
    data_types = check_data_types(df)
    duplicate_count = check_duplicates(df)

    print(f"Missing Values:\n{missing_values}\n")
    print(f"Data Types:\n{data_types}\n")
    print(f"Number of Duplicates: {duplicate_count}\n")
    return missing_values, duplicate_count


# Iterate over each dataset: load, report, clean, store, report again
for name, file_path in datasets.items():
    print(f"Processing {name} dataset...")

    # Load the dataset
    df = pd.read_csv(file_path)

    # Quality report on the raw data
    missing_values, duplicate_count = _report_quality(df)

    # Data Cleaning Steps
    # Handle missing values
    if name == 'Order Reviews':
        # The free-text review columns are dropped (instead of dropping rows)
        # when they contain missing values.
        columns_to_drop = ['review_comment_title', 'review_comment_message']
        # missing_values only lists columns that have at least one null
        missing_columns = [col for col in columns_to_drop if col in missing_values.index]
        if missing_columns:
            print(f"Dropping columns with high missing values in {name}: {missing_columns}")
            df = df.drop(columns=missing_columns)
    elif not missing_values.empty:
        # Every other dataset: drop rows that contain any missing value
        df = handle_missing_values(df, strategy='drop')

    # Convert date columns to datetime.
    # NOTE(review): only column names containing 'date' are matched, so columns
    # such as 'order_purchase_timestamp' are not converted here.
    date_columns = [col for col in df.columns if 'date' in col]
    conversions = {col: 'datetime64[ns]' for col in date_columns}
    df = convert_data_types(df, conversions)

    # Remove duplicates (only when the raw data had any)
    if duplicate_count > 0:
        df = remove_duplicates(df)

    # Store the cleaned DataFrame
    cleaned_dataframes[name] = df
    print(f"{name} dataset cleaned and stored.\n")

    # RE-CHECK: same quality report on the cleaned data
    missing_values, duplicate_count = _report_quality(df)


In [None]:
# Names of the datasets that made it through cleaning
keys = cleaned_dataframes.keys()
print("Keys in 'cleaned_dataframes':", keys)

# Preview the head of every cleaned dataset
for name in cleaned_dataframes:
    df = cleaned_dataframes[name]
    print(f"First few rows of {name} dataset:")
    print(df.head(), "\n")

# Persist each cleaned dataset as cleaned_<name>.csv in the working directory
for name, df in cleaned_dataframes.items():
    file_slug = name.lower().replace(' ', '_')
    cleaned_file_path = f"cleaned_{file_slug}.csv"
    df.to_csv(cleaned_file_path, index=False)
    print(f"Cleaned {name} dataset saved to {cleaned_file_path}.")


**Relevant Columns for Each Dataset**

In [None]:
# Define necessary columns for each dataset.
# Keys must match the keys of `cleaned_dataframes`; values are the columns kept
# for the merge (join keys such as order_id / product_id / seller_id included).
required_columns = {
    'Orders': ['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp'],
    'Order Items': ['order_id', 'product_id', 'seller_id', 'price', 'freight_value'],
    'Order Payments': ['order_id', 'payment_type', 'payment_value'],
    'Products': ['product_id', 'product_category_name'],
    'Sellers': ['seller_id', 'seller_city', 'seller_state'],
    'Order Reviews': ['order_id', 'review_score'],
    'Category Translation': ['product_category_name', 'product_category_name_english']
}


**Filter Each Dataset for Necessary Columns**

In [None]:
# Column-restricted view of every cleaned dataset, keyed by dataset name
filtered_dataframes = {}

for name, columns in required_columns.items():
    # Keep just the columns listed for this dataset
    selected = cleaned_dataframes[name][columns]
    filtered_dataframes[name] = selected
    print(f"Filtered columns for '{name}': {selected.columns.tolist()}")


**Merge the Filtered Datasets Dynamically**

In [None]:
# Merge plan: (dataset name, join key). The first entry is the base table,
# every later entry is left-joined onto the running result.
merge_sequence = [
    ('Orders', 'order_id'),
    ('Order Items', 'order_id'),
    ('Order Payments', 'order_id'),
    ('Products', 'product_id'),
    ('Category Translation', 'product_category_name'),
    ('Sellers', 'seller_id'),
    ('Order Reviews', 'order_id')
]

# Start from the first dataset in the plan
base_df_name, base_key = merge_sequence[0]
base_df = filtered_dataframes[base_df_name]
print(f"Starting with base DataFrame: {base_df_name}")

# Left-join the remaining datasets one after another
for df_name, key in merge_sequence[1:]:
    print(f"Merging '{df_name}' into '{base_df_name}' on key '{key}'")
    right_df = filtered_dataframes[df_name]
    base_df = base_df.merge(right_df, on=key, how='left')
    print(f"Shape after merging '{df_name}': {base_df.shape}")

# The fully merged result
final_merged_df = base_df
print(f"\nFinal merged DataFrame shape: {final_merged_df.shape}")


**Verification and Final Check**

In [None]:
# Preview the merged result
print(final_merged_df.head())

# Per-column null counts, reported only for columns that still have gaps
missing_values = final_merged_df.isnull().sum()
columns_with_gaps = missing_values[missing_values > 0]
print("\nMissing values in the final merged DataFrame:\n", columns_with_gaps)


**POST MERGE QUALITY CHECKS**

*Missing Values*

In [None]:
# Absolute number of nulls per column
missing_values = final_merged_df.isnull().sum()
print("Missing values per column:")
print(missing_values[missing_values > 0])

# The same counts expressed as a share of all rows
total_rows = final_merged_df.shape[0]
missing_percentage = (missing_values / total_rows) * 100
nonzero_percentage = missing_percentage[missing_percentage > 0]
print("Percentage of missing values per column:")
print(nonzero_percentage)

In [None]:
# Columns whose share of missing values is above this percentage get dropped
threshold = 50.0  # percentage
columns_to_drop = missing_percentage[missing_percentage > threshold].index

# Report and drop, or state that nothing qualifies
if len(columns_to_drop) == 0:
    print("No columns exceed the missing value threshold.")
else:
    print(f"Columns to drop (more than {threshold}% missing values): {list(columns_to_drop)}")
    final_merged_df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Minimum number of non-null values a row needs in order to be kept:
# number of columns minus the allowed missing values.
threshold = final_merged_df.shape[1] - 2  # Allow up to 2 missing values per row

# Drop rows with missing values exceeding the threshold
rows_before = final_merged_df.shape[0]
cleaned_df = final_merged_df.dropna(thresh=threshold).copy()

# Display the number of rows removed
rows_removed = rows_before - cleaned_df.shape[0]
print(f"Number of rows removed: {rows_removed}")

# Carry the filtered rows forward. Previously `cleaned_df` was computed and
# reported but never used, so every later step still ran on the unfiltered frame.
final_merged_df = cleaned_df


*Validate Datatypes*

In [None]:
print("Data types before conversion:")
print(final_merged_df.dtypes)


In [None]:
# Candidate datetime columns; only those present in the merged frame are parsed.
date_columns = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
                'order_delivered_customer_date', 'order_estimated_delivery_date']
present_date_columns = [name for name in date_columns if name in final_merged_df.columns]
for col in present_date_columns:
    # Unparseable values become NaT instead of raising
    final_merged_df[col] = pd.to_datetime(final_merged_df[col], errors='coerce')
print("Data types after conversion:")
print(final_merged_df.dtypes)


*Duplicate Values*

In [None]:
# keep=False flags every member of a duplicate group, not only the repeats
is_duplicated = final_merged_df.duplicated(keep=False)
duplicate_rows = final_merged_df[is_duplicated]
if duplicate_rows.empty:
    print("No duplicate rows found.")
else:
    print(f"Warning: Found {len(duplicate_rows)} duplicate rows.")


In [None]:
final_merged_df.drop_duplicates(inplace=True)
print("Duplicate rows removed.")


*Data Consistency*

Numerical Variables

In [None]:
# Allowed (minimum, maximum) per numerical column; None means unbounded on that side
expected_ranges = {
    'price': (0, None),  # No negative prices, upper limit None (unbounded)
    'freight_value': (0, None),  # Freight value should be non-negative
    'payment_value': (0, None),  # Payment values should be non-negative
    'review_score': (1, 5)  # Assuming review scores are between 1 and 5
}

# Walk over the float64/int64 columns and clamp out-of-range values to the bounds
for col, dtype in final_merged_df.dtypes.items():
    is_numeric = dtype == 'float64' or dtype == 'int64'
    if not is_numeric:
        continue

    if col not in expected_ranges:
        print(f"No expected range specified for '{col}'. Please review this column manually if needed.")
        continue

    min_val, max_val = expected_ranges[col]

    # Values below the lower bound are raised to it
    if min_val is not None:
        too_low = final_merged_df[col] < min_val
        if too_low.any():
            print(f"Warning: Found {too_low.sum()} entries in '{col}' below the minimum expected value of {min_val}. Setting them to {min_val}.")
            final_merged_df.loc[too_low, col] = min_val

    # Values above the upper bound are lowered to it
    if max_val is not None:
        too_high = final_merged_df[col] > max_val
        if too_high.any():
            print(f"Warning: Found {too_high.sum()} entries in '{col}' above the maximum expected value of {max_val}. Setting them to {max_val}.")
            final_merged_df.loc[too_high, col] = max_val


Categorical Variables

In [None]:
import pandas as pd

# Columns that require standardization (categorical text columns)
categorical_columns = [
    'order_status', 'payment_type', 
    'product_category_name', 'product_category_name_english', 
    'seller_city', 'seller_state'
]

# Function to standardize categorical data
def standardize_categorical_data(df, columns):
    """
    Standardize text columns in place and return the same DataFrame.

    For every name in ``columns`` that exists in ``df`` the values are:
    Unicode-decomposed (NFKD), lower-cased, spaces replaced by underscores,
    and every character other than a-z, 0-9 and underscore removed.
    Names not present in ``df`` are skipped; missing values stay missing.
    """
    for col in columns:
        if col in df.columns:
            df[col] = (
                df[col]
                # Split accented letters into base letter + combining mark so the
                # final regex removes only the mark ('são' -> 'sao') instead of
                # deleting the whole letter ('são' -> 'so').
                .str.normalize('NFKD')
                .str.lower()
                .str.replace(' ', '_', regex=False)
                .str.replace(r'[^a-z0-9_]', '', regex=True)
            )
    return df

# Applying the function to standardize selected categorical columns
final_merged_df = standardize_categorical_data(final_merged_df, categorical_columns)

# Verify the results. Columns absent from the frame are skipped, mirroring the
# check inside standardize_categorical_data (previously this raised KeyError).
print("Standardized Categorical Columns:")
for col in categorical_columns:
    if col not in final_merged_df.columns:
        print(f"Column '{col}' not found in the DataFrame; skipping.")
        continue
    print(f"{col} unique values after standardization:\n{final_merged_df[col].unique()}\n")


Optional: similarity-encode the categorical variables — **CAUTION: this step replaces each original categorical column with numeric `<column>_encoded_<i>` columns**

In [None]:
# Environment setup for skrub.
# NOTE(review): the packages are upgraded, inspected, uninstalled and then
# reinstalled without version pins; presumably this worked around a dependency
# conflict — confirm whether a single pinned install would be enough.
import sys
!{sys.executable} -m pip install --upgrade packaging

# NOTE(review): bare `pip ...` lines (no `!` / `%` prefix) are not Python;
# they presumably rely on IPython automagic being enabled — confirm.
pip install --upgrade skrub

import sys
!{sys.executable} -m pip show packaging

!{sys.executable} -m pip show skrub



# Uninstalls both skrub and packaging (-y skips the confirmation prompt)
!{sys.executable} -m pip uninstall skrub -y packaging

pip install skrub packaging

import pandas as pd
from skrub import SimilarityEncoder

# Initialize the SimilarityEncoder
encoder = SimilarityEncoder(similarity='ngram', ngram_range=(2, 4), categories='auto')

# Replace categorical columns by their encoder output
def encode_categorical_columns(df, columns):
    """
    Swap each listed column for the columns produced by the module-level ``encoder``.

    For every name in ``columns`` that is present in ``df``, the values are cast
    to ``str``, passed through ``encoder.fit_transform`` as a single-column 2D
    array, and the result is joined back as ``<col>_encoded_<i>`` columns while
    the original column is dropped. Names not present are skipped.
    Returns the resulting DataFrame.
    """
    for col in columns:
        if col not in df.columns:
            continue

        # Single-column 2D array of strings, the input shape the encoder is given
        raw_values = df[[col]].astype(str).values
        similarities = encoder.fit_transform(raw_values)

        # Wrap the encoder output, aligned on the original row index
        encoded_df = pd.DataFrame(similarities, index=df.index)
        n_encoded = encoded_df.shape[1]
        encoded_df.columns = [f"{col}_encoded_{i}" for i in range(n_encoded)]

        # Original column out, encoded columns in
        remaining = df.drop(columns=[col])
        df = remaining.join(encoded_df)
    return df

# Run the encoder over the selected categorical columns
final_merged_df = encode_categorical_columns(final_merged_df, categorical_columns)


# Consistency check: every categorical column should now have encoded counterparts
for col in categorical_columns:
    prefix = f"{col}_encoded_"
    encoded_cols = [c for c in final_merged_df.columns if c.startswith(prefix)]
    if not encoded_cols:
        print(f"Warning: Column '{col}' was not found or encoded.")
    else:
        print(f"Column '{col}' has been encoded into {len(encoded_cols)} columns.")

# Review after encoding: preview of the encoded DataFrame
print(final_merged_df.head())


*Referential Integrity Checks*

In [None]:
# Orphan check: product_id values in the merged frame that are absent from the Products table
if 'product_id' in final_merged_df.columns:
    unique_product_ids = final_merged_df['product_id'].unique()
    known_product_ids = set(filtered_dataframes['Products']['product_id'].unique())
    missing_products = set(unique_product_ids).difference(known_product_ids)
    if not missing_products:
        print("All 'product_id' entries have matching records in products data.")
    else:
        print(f"Warning: Found {len(missing_products)} 'product_id' entries without matching records in products data.")


In [None]:
# Keep only the rows whose product_id exists in the Products table
if 'product_id' in final_merged_df.columns:
    has_known_product = final_merged_df['product_id'].isin(filtered_dataframes['Products']['product_id'])
    final_merged_df = final_merged_df[has_known_product]
    print("Removed rows with orphaned 'product_id' references.")


*Statistical Summaries*

In [None]:
print("Descriptive statistics for numerical columns:")
print(final_merged_df.describe())

*Frequency Distribution*

In [None]:
import matplotlib.pyplot as plt

# One histogram per float64/int64 column to visualize its distribution
numerical_columns = final_merged_df.select_dtypes(include=['float64', 'int64']).columns

for col in numerical_columns:
    values = final_merged_df[col].dropna()
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(values, bins=30, edgecolor='k', alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()


*Validate Business Logic*

In [None]:
final_merged_df.columns

**CAUTION -- Date Columns**

In [None]:
# Delivery must not precede approval; only checked when both columns are present
needed_date_columns = ('order_approved_at', 'order_delivered_customer_date')
if all(name in final_merged_df.columns for name in needed_date_columns):
    delivered_before_approved = (
        final_merged_df['order_delivered_customer_date'] < final_merged_df['order_approved_at']
    )
    invalid_dates = final_merged_df[delivered_before_approved]
    if invalid_dates.empty:
        print("All date relationships are valid.")
    else:
        print(f"Warning: Found {len(invalid_dates)} records with 'order_delivered_customer_date' earlier than 'order_approved_at'.")


In [None]:
# Flag rows whose order_status is outside the accepted list
if 'order_status' in final_merged_df.columns:
    valid_statuses = ['delivered', 'shipped', 'canceled', 'processing']
    status_is_valid = final_merged_df['order_status'].isin(valid_statuses)
    invalid_statuses = final_merged_df[~status_is_valid]
    if invalid_statuses.empty:
        print("All 'order_status' values are valid.")
    else:
        print(f"Warning: Found {len(invalid_statuses)} entries with unexpected 'order_status' values.")


**Saving the Final Dataset**

In [None]:
# Save the DataFrame to a CSV file
final_merged_df.to_csv('final_merged_df.csv', index=False)


In [None]:
final_merged_df.shape

In [None]:
#Dataset Basic Information

# Display the first few rows
print("First few rows of the dataset:")
print(final_merged_df.head())

# Display basic information.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
print("\nDataset Information:")
final_merged_df.info()

# Display summary statistics for numerical columns
print("\nSummary Statistics:")
print(final_merged_df.describe())

**Exploratory Data Analysis**

In [None]:
#Importing Libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
#Reading the final df CSV file
# NOTE(review): this path is a Kaggle input dataset, not the working-directory
# file written by the earlier `to_csv('final_merged_df.csv')` cell; presumably
# the saved file was re-uploaded — confirm both are the same version.
# Loading from CSV rebinds `final_merged_df`, replacing the in-memory frame.

final_merged_df = pd.read_csv('/kaggle/input/final-merged-dataset/final_merged_df.csv')

In [None]:
#Dataset Basic Information

# Display the first few rows
print("First few rows of the dataset:")
print(final_merged_df.head())

# Display basic information.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
print("\nDataset Information:")
final_merged_df.info()

# Display summary statistics for numerical columns
print("\nSummary Statistics:")
print(final_merged_df.describe())


In [None]:
#Checking Missing Values

# Null count per column and the same figure as a percentage of all rows
missing_values = final_merged_df.isnull().sum()
row_count = final_merged_df.shape[0]
missing_percentage = (missing_values / row_count) * 100

# Side-by-side table, restricted to columns that actually have gaps
missing_data = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentage
})
has_gaps = missing_data['Missing Values'] > 0
missing_data = missing_data[has_gaps]

print("\nMissing Values per Column:")
print(missing_data)


*Categorical*

In [None]:
#Distribution of Categorical Variables

# Object-dtype columns are treated as categorical
categorical_columns = final_merged_df.select_dtypes(include=['object']).columns

# Report how many distinct values each of them holds
for col in categorical_columns:
    unique_values = final_merged_df[col].nunique()
    print(f"'{col}' has {unique_values} unique categories.")

In [None]:
import matplotlib.pyplot as plt
import math

# Columns whose most frequent categories are charted
columns_to_analyze = ['order_status', 'payment_type', 'product_category_name_english', 'seller_city', 'seller_state']

# How many categories to show per column, and the grid width
top_n = 10
plots_per_row = 2

# Grid height needed to fit one panel per column
num_rows = math.ceil(len(columns_to_analyze) / plots_per_row)

# Build the grid and flatten it so panels can be addressed by position
fig, axes = plt.subplots(num_rows, plots_per_row, figsize=(15, num_rows * 5))
axes = axes.flatten()

# One bar chart per column; the panel position follows the column's list position
for idx, col in enumerate(columns_to_analyze):
    if col not in final_merged_df.columns:
        print(f"Column '{col}' not found in the DataFrame.")
        continue

    ax = axes[idx]
    # Most frequent values, missing values counted as their own category
    top_categories = final_merged_df[col].value_counts(dropna=False).nlargest(top_n)
    top_categories.plot(kind='bar', ax=ax)
    ax.set_title(f'Top {top_n} Categories in {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
    ax.grid(True)

# Drop the grid cells after the last column's panel
for unused_ax in axes[idx + 1:]:
    fig.delaxes(unused_ax)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()


*Numerical*

In [None]:
#Distribution of Numerical Variables

# All numeric-dtype columns
numerical_columns = final_merged_df.select_dtypes(include=['number']).columns

# Grid of histograms, one per numeric column
numeric_frame = final_merged_df[numerical_columns]
numeric_frame.hist(bins=30, figsize=(15, 10), edgecolor='black')
plt.suptitle('Histograms of Numerical Features')
# Leave room at the top for the super-title
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Number of plots per row
plots_per_row = 2

# Filtered lists of columns for manageable plots:
# float64/int64 columns, and object columns with at most 20 distinct values
numerical_columns = final_merged_df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = [col for col in final_merged_df.select_dtypes(include=['object']).columns if final_merged_df[col].nunique() <= 20]

# One panel per (numerical, categorical) pair
plot_pairs = [(num_col, cat_col) for num_col in numerical_columns for cat_col in categorical_columns]
total_plots = len(plot_pairs)

if total_plots == 0:
    # plt.subplots(0, ...) raises, so report and skip instead of crashing
    print("No numerical/categorical column pairs available for box plots.")
else:
    num_rows = math.ceil(total_plots / plots_per_row)

    # squeeze=False always returns a 2D array, so flatten() works for any grid size
    fig, axes = plt.subplots(num_rows, plots_per_row, figsize=(15, num_rows * 5), squeeze=False)
    axes = axes.flatten()

    # Generate box plots
    for ax, (num_col, cat_col) in zip(axes, plot_pairs):
        sns.boxplot(data=final_merged_df, x=cat_col, y=num_col, ax=ax)
        ax.set_title(f'{num_col} Distribution by {cat_col}')
        ax.set_xlabel(cat_col)
        ax.set_ylabel(num_col)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True)

    # Remove the grid cells left over after the last plot
    for unused_ax in axes[total_plots:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


*Correlation Matrix*

In [None]:
# Pairwise correlations between the numerical columns
correlation_matrix = final_merged_df[numerical_columns].corr()

# Annotated heatmap, colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()


*Time Series Analysis*

In [None]:
#Analyse Trends over time

# Ensure 'order_purchase_timestamp' is in datetime format
final_merged_df['order_purchase_timestamp'] = pd.to_datetime(final_merged_df['order_purchase_timestamp'])

# Extract date-related features
final_merged_df['order_date'] = final_merged_df['order_purchase_timestamp'].dt.date
final_merged_df['order_month'] = final_merged_df['order_purchase_timestamp'].dt.to_period('M')

# Count distinct orders per day. The merged frame can hold several rows for one
# order_id (it was left-joined with items, payments and reviews), so counting
# rows would overstate the number of orders.
order_counts = final_merged_df.groupby('order_date')['order_id'].nunique()

# Plot order counts over time
plt.figure(figsize=(12, 6))
order_counts.plot()
plt.title('Number of Orders Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Orders')
plt.grid(True)
plt.show()


In [None]:
# Ensure 'order_purchase_timestamp' is in datetime format
final_merged_df['order_purchase_timestamp'] = pd.to_datetime(final_merged_df['order_purchase_timestamp'])

# Set the order purchase timestamp as the index
time_series_df = final_merged_df.set_index('order_purchase_timestamp')

# Resample to get monthly counts of distinct orders. The merged frame can hold
# several rows for one order_id (items, payments, reviews), so .count() on the
# rows would overstate the number of orders.
monthly_orders = time_series_df['order_id'].resample('M').nunique()

# Plot the time series
# NOTE(review): the date range in the title is hard-coded — confirm it matches the data.
plt.figure(figsize=(12, 6))
monthly_orders.plot()
plt.title('Monthly Order Counts between 2017-10 and 2018-07')
plt.xlabel('Date')
plt.ylabel('Number of Orders')
plt.grid(True)
plt.show()


*Pairwise Relationships*

In [None]:
# First five numerical columns only, to keep the grid readable (adjust as needed)
subset_numerical_columns = numerical_columns[:5]

# Scatter-matrix over rows that are complete in those columns
pairplot_data = final_merged_df[subset_numerical_columns].dropna()
sns.pairplot(pairplot_data, height=1.5)
plt.suptitle('Pairwise Relationships Between Numerical Variables', y=1.02)
plt.show()


*Geographical Trends* 

In [None]:
#Analysing Geographical Data

# Row counts per seller state, drawn as a bar chart
state_counts = final_merged_df['seller_state'].value_counts()
plt.figure(figsize=(12, 6))
state_counts.plot(kind='bar')
plt.title('Number of Orders by Seller State')
plt.xlabel('State')
plt.ylabel('Number of Orders')
plt.xticks(rotation=45)
plt.show()


In [34]:
# Create the script file and write the content to it.
#
# SECURITY: the GitHub Personal Access Token is no longer embedded here. The
# script reads it from the GITHUB_TOKEN environment variable and aborts if it is
# not set, so export it in the kernel environment (for example from Kaggle
# secrets via os.environ["GITHUB_TOKEN"]) before running the script.
# The token that used to be hardcoded in this cell must be treated as leaked
# and revoked on GitHub.
#
# The shebang is now the first line of the file (the string used to start with
# a blank line, which made the shebang ineffective).
script_content = """#!/bin/bash

# Set Git user information
git config --global user.email "chennarajvamsi@outlook.com"
git config --global user.name "Rajvamsi Chenna"

# GitHub Username and Repository
GITHUB_USERNAME="rajvamsi18"
REPO_NAME="Real-Time-Pricing-Algorithm-Project"

# Read the GitHub Personal Access Token from the environment (never hardcode it)
GITHUB_TOKEN="${GITHUB_TOKEN:?Set the GITHUB_TOKEN environment variable before running this script}"

# Clone the repository (only if not already cloned)
if [ ! -d "$REPO_NAME" ]; then
    echo "Cloning the repository..."
    git clone https://$GITHUB_USERNAME:$GITHUB_TOKEN@github.com/$GITHUB_USERNAME/$REPO_NAME.git
fi

# Change to the repository directory
cd "$REPO_NAME"

# Pull the latest changes from GitHub
echo "Pulling the latest changes from GitHub..."
git pull origin main

# Stage all changes
echo "Staging changes..."
git add .

# Commit changes with a default or user-defined message
COMMIT_MESSAGE=${1:-"Automated commit from Kaggle"}
echo "Committing changes with message: $COMMIT_MESSAGE"
git commit -m "$COMMIT_MESSAGE"

# Push changes back to GitHub
echo "Pushing changes to GitHub..."
git push https://$GITHUB_USERNAME:$GITHUB_TOKEN@github.com/$GITHUB_USERNAME/$REPO_NAME.git
"""

# Write the script content to a file
with open("sync_to_github.sh", "w") as file:
    file.write(script_content)


In [35]:
!chmod +x sync_to_github.sh


In [36]:
!./sync_to_github.sh "Version 1 EDA"


Pulling the latest changes from GitHub...
From https://github.com/rajvamsi18/Real-Time-Pricing-Algorithm-Project
 * branch            main       -> FETCH_HEAD
Already up to date.
Staging changes...
Committing changes with message: Version 1 EDA
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Pushing changes to GitHub...
Everything up-to-date


In [37]:
!ls Real-Time-Pricing-Algorithm-Project


README.md  data  notebooks


In [38]:
!mv real-time-pricing.ipynb Real-Time-Pricing-Algorithm-Project/



mv: cannot stat 'real-time-pricing.ipynb': No such file or directory


*Feature Engineering*