In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import os
def load_data(file_path):
    """Read a CSV file into a DataFrame and report its size.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``
        with default options.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    print("Loading data...")
    frame = pd.read_csv(file_path)
    n_rows, n_cols = frame.shape
    print(f"Loaded {n_rows} rows and {n_cols} columns.")
    return frame
def preprocess_data(df):
    """Impute, label-encode and standardise a DataFrame.

    Processing steps:

    1. ``int64``/``float64`` columns: missing values are replaced by the
       column mean, then the columns are standardised with
       ``StandardScaler``.
    2. ``object`` columns: missing values are replaced by the most
       frequent value, then each column is replaced by its
       ``LabelEncoder`` integer codes.

    Columns of any other dtype are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, holding the processed values.
    """
    print("Starting preprocessing...")

    # Work on a copy so the caller's frame stays intact and calling this
    # function again on the same input gives the same result.
    df = df.copy()

    num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()

    print(f"Numeric columns: {num_cols}")
    print(f"Categorical columns: {cat_cols}")

    # NOTE(review): any column that pandas reads as ``object`` is label-encoded
    # below, including numeric-looking columns whose text failed to parse as
    # numbers. Confirm that this is intended for such columns, or convert them
    # to a numeric dtype before calling this function.

    # The scikit-learn transformers reject an input with zero columns, so each
    # stage is skipped when there is nothing for it to process.
    if num_cols:
        num_imputer = SimpleImputer(strategy='mean')
        df[num_cols] = num_imputer.fit_transform(df[num_cols])

        scaler = StandardScaler()
        df[num_cols] = scaler.fit_transform(df[num_cols])

    if cat_cols:
        cat_imputer = SimpleImputer(strategy='most_frequent')
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

        # A fresh encoder per column: codes are only meaningful within a column.
        for col in cat_cols:
            df[col] = LabelEncoder().fit_transform(df[col])

    return df
def save_processed_data(df, output_path):
    """Write a DataFrame to a CSV file, omitting the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    output_path : str or path-like
        Destination file; handed directly to ``DataFrame.to_csv``.
    """
    status_line = f"Saving processed data to {output_path}"
    print(status_line)
    df.to_csv(path_or_buf=output_path, index=False)
def run_pipeline(input_file, output_file):
    """Run the full load -> preprocess -> save sequence.

    Parameters
    ----------
    input_file : str or path-like
        CSV file handed to ``load_data``.
    output_file : str or path-like
        Destination handed to ``save_processed_data``.
    """
    save_processed_data(preprocess_data(load_data(input_file)), output_file)
    print("ETL complete!")
if __name__ == "__main__":
    # Both paths are relative, so they resolve against the current working
    # directory of the kernel/process.
    input_file, output_file = "ODI Cricket Data.csv", "processed_odi_cricket_data.csv"
    run_pipeline(input_file=input_file, output_file=output_file)


Loading data...
Loaded 1540 rows and 17 columns.
Starting preprocessing...
Numeric columns: ['total_runs', 'total_balls_faced', 'total_wickets_taken', 'total_runs_conceded', 'total_overs_bowled', 'total_matches_played', 'matches_played_as_batter', 'matches_played_as_bowler', 'matches_won', 'matches_lost', 'player_of_match_awards']
Categorical columns: ['player_name', 'role', 'strike_rate', 'team', 'average', 'percentage']
Saving processed data to processed_odi_cricket_data.csv
ETL complete!


In [4]:
import pandas as pd
# Re-read the raw CSV and print its first ten rows as plain text.
# NOTE(review): in the printed output the strike_rate values contain several
# dots (e.g. "9.170.381.212.161.530"), so they look like strings rather than
# parsed numbers -- confirm the number formatting of the source file.
df = pd.read_csv("ODI Cricket Data.csv")
first_10_rows = df.iloc[:10]
print(first_10_rows)


        player_name    role  total_runs            strike_rate  \
0           V Kohli  Batter       13784  9.170.381.212.161.530   
1     KC Sangakkara  Batter       11618  7.939.046.057.127.230   
2         RG Sharma  Batter       10646  9.035.817.348.497.700   
3          MS Dhoni  Batter       10274  8.497.932.175.351.530   
4    AB de Villiers  Batter        9435  9.944.139.966.273.180   
5        TM Dilshan  Batter        9212  8.547.833.348.798.360   
6       LRPL Taylor  Batter        8126  8.093.625.498.007.960   
7  DPMD Jayawardene  Batter        8040  7.857.701.329.163.400   
8           HM Amla  Batter        7834  8.648.708.324.133.360   
9       Tamim Iqbal  Batter        7648    766.409.459.865.718   

   total_balls_faced  total_wickets_taken  total_runs_conceded  \
0              15031                    7                  681   
1              14634                    0                    0   
2              11782                   11                  538   
3        