In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw dataset into a DataFrame.
# NOTE(review): "/content" looks like a hosted-notebook working directory -
# confirm the path before running elsewhere.
file_path = "/content/Accumulative_distribution.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Display basic info and first few rows
print("Initial Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.head())

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12160 entries, 0 to 12159
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            12160 non-null  int64  
 1   Type          12160 non-null  object 
 2   Movie_number  12160 non-null  int64  
 3   Fly_number    12160 non-null  int64  
 4   Other_number  12160 non-null  int64  
 5   Difference_x  12160 non-null  float64
 6   Difference_y  12160 non-null  float64
 7   Distance      12160 non-null  float64
dtypes: float64(3), int64(4), object(1)
memory usage: 760.1+ KB
None
   ID           Type  Movie_number  Fly_number  Other_number  Difference_x  \
0   1  Dmelanogaster             1           1             0     -9.713147   
1   2  Dmelanogaster             1           2             0    -23.854249   
2   3  Dmelanogaster             1           3             0    -59.388140   
3   4  Dmelanogaster             1           4             0  

In [4]:
# Handling Missing Values - Data Imputation
# One imputer per column kind: the median for numerical columns and the most
# frequent value for categorical columns.
num_imputer, cat_imputer = (
    SimpleImputer(strategy="median"),
    SimpleImputer(strategy="most_frequent"),
)

In [5]:
# Identifier / index columns are integer-typed, but they are labels rather
# than measurements. Leaving them in num_cols would make the later cells
# median-impute, clip and z-score them, so the saved CSV would contain
# standardized floats where the identifiers should be.
id_cols = ["ID", "Movie_number", "Fly_number", "Other_number"]

# Numeric measurement columns (imputed, clipped and standardized below).
# errors="ignore" keeps this working if an identifier column is absent.
num_cols = (
    df.drop(columns=id_cols, errors="ignore")
    .select_dtypes(include=[np.number])
    .columns
)
# Text columns (imputed with their most frequent value below).
cat_cols = df.select_dtypes(include=['object']).columns

In [6]:
# Fit each imputer on its own column group and write the filled values back.
numeric_filled = num_imputer.fit_transform(df[num_cols])
df[num_cols] = numeric_filled

categorical_filled = cat_imputer.fit_transform(df[cat_cols])
df[cat_cols] = categorical_filled

In [7]:
# Removing Duplicates
# Keep the first occurrence of every fully identical row; the surviving rows
# keep their original index labels.
df = df.drop_duplicates(keep="first")

In [8]:
# Handling Outliers (Optional)
# Cap every numeric column at Q1 - 1.5*IQR and Q3 + 1.5*IQR: values outside
# that range are replaced by the nearest bound rather than dropped.
for col in num_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    values = df[col]
    # Cap the high side first, then the low side, in a single assignment.
    capped_high = np.where(values > upper_bound, upper_bound, values)
    df[col] = np.where(values < lower_bound, lower_bound, capped_high)

In [9]:
# Standardizing Numerical Columns (Optional)
# Fit the scaler on the numeric columns, then overwrite them with the
# standardized values.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

In [11]:
# Write the cleaned frame to disk (without the index column) and report the path.
cleaned_file_path = "Cleaned_Accumulative_distribution.csv"
df.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"Cleaned data saved at: {cleaned_file_path}")

Cleaned data saved at: Cleaned_Accumulative_distribution.csv


In [13]:
# Re-read the source file: `df` has been modified by the cleaning cells, so a
# fresh copy is needed as the "before" side of the comparisons below.
df_original = pd.read_csv(filepath_or_buffer=file_path)

In [14]:
# Compare original and cleaned data: per-column count of missing values.
missing_before = df_original.isna().sum()
missing_after = df.isna().sum()

print("Comparison between Original and Cleaned Data:")
print("Missing Values Before Cleaning:")
print(missing_before)
print("Missing Values After Cleaning:")
print(missing_after)

Comparison between Original and Cleaned Data:
Missing Values Before Cleaning:
ID              0
Type            0
Movie_number    0
Fly_number      0
Other_number    0
Difference_x    0
Difference_y    0
Distance        0
dtype: int64
Missing Values After Cleaning:
ID              0
Type            0
Movie_number    0
Fly_number      0
Other_number    0
Difference_x    0
Difference_y    0
Distance        0
dtype: int64


In [15]:
# Number of fully duplicated rows in the original and in the cleaned frame.
duplicates_before = df_original.duplicated().sum()
duplicates_after = df.duplicated().sum()
print("Duplicate Rows Before Cleaning:", duplicates_before)
print("Duplicate Rows After Cleaning:", duplicates_after)

Duplicate Rows Before Cleaning: 0
Duplicate Rows After Cleaning: 0


In [16]:
# Descriptive statistics of the numeric columns, before and after cleaning.
stats_before = df_original.describe()
stats_after = df.describe()

print("Summary Statistics Before Cleaning:")
print(stats_before)
print("Summary Statistics After Cleaning:")
print(stats_after)

Summary Statistics Before Cleaning:
                 ID  Movie_number    Fly_number  Other_number  Difference_x  \
count  12160.000000  12160.000000  12160.000000  12160.000000  1.216000e+04   
mean    6080.500000      5.906250      9.500000      9.500000 -2.804774e-17   
std     3510.433971      3.185639      5.766518      5.766518  3.647669e+01   
min        1.000000      1.000000      0.000000      0.000000 -1.113577e+02   
25%     3040.750000      3.000000      4.750000      4.750000 -1.703750e+01   
50%     6080.500000      6.000000      9.500000      9.500000  0.000000e+00   
75%     9120.250000      8.250000     14.250000     14.250000  1.703750e+01   
max    12160.000000     12.000000     19.000000     19.000000  1.113577e+02   

       Difference_y      Distance  
count  1.216000e+04  12160.000000  
mean  -2.337312e-17     42.227294  
std    3.870810e+01     32.335356  
min   -1.140549e+02      0.993842  
25%   -1.833235e+01      8.792832  
50%    0.000000e+00     39.103716  
