In [69]:
import pandas as pd
import numpy as np
from scipy import stats

In [71]:
# Load data from a CSV file
df = pd.read_csv('file.csv')

In [73]:
# Inspect the data
print("Original DataFrame:")
print(df.head())

Original DataFrame:
    User ID country  Gender   Age   salary  Purchased
0  15624510   India    Male  19.0  19000.0          0
1  15810944     USA    Male  35.0      NaN          1
2  15668575  France  Female  26.0  43000.0          0
3  15603246     USA  Female   NaN  57000.0          0
4  15804002  France    Male  19.0  76000.0          0


In [75]:
print("\nDataFrame Info:")
print(df.info())


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   User ID    20 non-null     int64  
 1   country    19 non-null     object 
 2   Gender     20 non-null     object 
 3   Age        17 non-null     float64
 4   salary     19 non-null     float64
 5   Purchased  20 non-null     int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 1.1+ KB
None


In [77]:
print("\nStatistical Summary:")
print(df.describe())


Statistical Summary:
            User ID        Age         salary  Purchased
count  2.000000e+01  17.000000      19.000000  20.000000
mean   1.567881e+07  29.529412   57368.421053   0.400000
std    6.987218e+04   9.348246   33128.052105   0.502625
min    1.557077e+07  18.000000   18000.000000   0.000000
25%    1.561468e+07  25.000000   28000.000000   0.000000
50%    1.569626e+07  27.000000   57000.000000   0.000000
75%    1.572768e+07  35.000000   80000.000000   1.000000
max    1.581094e+07  47.000000  150000.000000   1.000000


In [79]:
# Handling Missing Values
print("\nMissing Values in Each Column:")
print(df.isnull().sum())


Missing Values in Each Column:
User ID      0
country      1
Gender       0
Age          3
salary       1
Purchased    0
dtype: int64


In [81]:
# Drop rows with any missing values (optional)
# df_cleaned = df.dropna()

# Fill missing values with a specific value (e.g., 0)
df.fillna(value=0, inplace=True)

In [83]:
# Fill missing values with the mean of the column for a specific column
# df['column_name'].fillna(df['column_name'].mean(), inplace=True)
# Removing Duplicates
print("\nNumber of Duplicate Rows:", df.duplicated().sum())
df_cleaned = df.drop_duplicates()


Number of Duplicate Rows: 0


In [87]:
# Changing Data Types
# Convert a column to a specific data type (e.g., integer)
# df_cleaned['column_name'] = df_cleaned['column_name'].astype(int)

# Convert a string column to datetime
# df_cleaned['date_column'] = pd.to_datetime(df_cleaned['date_column'])

# Renaming Columns
df_cleaned.rename(columns={'country': 'Country'}, inplace=True)


In [89]:
# Filtering Rows
# Filter rows where 'column_name' > threshold_value
# df_filtered = df_cleaned[df_cleaned['column_name'] > threshold_value]

# Standardizing Text Data
# Convert a text column to lowercase
df_cleaned['Gender'] = df_cleaned['Gender'].str.lower()
# Remove leading and trailing whitespace
df_cleaned['Gender'] = df_cleaned['Gender'].str.strip()
# Replace specific strings
# df_cleaned['text_column'] = df_cleaned['text_column'].str.replace('old_value', 'new_value')

In [95]:
# Binning Continuous Data
# Bin a continuous variable into categories
# bins = [0, 50, 100]
# labels = ['Low', 'High']
# df_cleaned['binned_column'] = pd.cut(df_cleaned['continuous_column'], bins=bins, labels=labels)

# Outlier Detection and Removal
df_cleaned = df_cleaned[(np.abs(stats.zscore(df_cleaned['Age'])) < 3)]

In [91]:
# Final Check
print("\nCleaned DataFrame Info:")
print(df_cleaned.info())
print("\nCleaned DataFrame:")
print(df_cleaned.head())


Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   User ID    20 non-null     int64  
 1   Country    20 non-null     object 
 2   Gender     20 non-null     object 
 3   Age        20 non-null     float64
 4   salary     20 non-null     float64
 5   Purchased  20 non-null     int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 1.1+ KB
None

Cleaned DataFrame:
    User ID Country  Gender   Age   salary  Purchased
0  15624510   India    male  19.0  19000.0          0
1  15810944     USA    male  35.0      0.0          1
2  15668575  France  female  26.0  43000.0          0
3  15603246     USA  female   0.0  57000.0          0
4  15804002  France    male  19.0  76000.0          0


In [93]:
# Export the cleaned DataFrame to a new CSV file
df_cleaned.to_csv('cleaned_data.csv', index=False)