In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load, inspect, clean and save the ghc train/test TSV files.
#
# Cleaning is deliberately limited to dropping incomplete and duplicated rows:
#   * 'hd', 'cv' and 'vo' are 0/1 integer indicator columns (describe() reports
#     min 0, max 1), so they are NOT standardized. Scaling them would turn the
#     flags into arbitrary floats.
#   * No IQR "outlier" filter is applied to them. Both the 25% and 75%
#     quantiles are 0, so IQR == 0 and the filter would discard every row
#     whose flag is 1.
#   * 'text' is free text, not a categorical variable. One-hot encoding it
#     creates one column per distinct message, and an inner column alignment
#     between train and test then throws nearly all of them away. The raw
#     text is kept as-is; turn it into features later with a text vectorizer
#     fitted on the training split only.

# NOTE(review): machine-specific absolute paths -- adjust to your environment.
train_file_path = r'F:\moVies\Springboard\data\ghc_train.tsv'
test_file_path = r'F:\moVies\Springboard\data\ghc_test.tsv'

# Binary indicator columns that must survive cleaning unchanged.
label_columns = ['hd', 'cv', 'vo']


def clean_frame(df, label_columns):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Rows containing any missing value are dropped, then exact duplicate rows
    are dropped. The original index labels of the surviving rows are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the TSV file.
    label_columns : list of str
        Columns expected to hold only the values 0 and 1.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the same columns as ``df``.

    Raises
    ------
    ValueError
        If a column in ``label_columns`` contains a value other than 0 or 1.
    """
    cleaned = df.dropna().drop_duplicates()

    # Guard the assumption that justifies skipping scaling/outlier removal.
    for column in label_columns:
        unexpected = set(cleaned[column].unique()) - {0, 1}
        if unexpected:
            raise ValueError(
                f"Column {column!r} is expected to be binary (0/1) "
                f"but also contains: {sorted(unexpected)}"
            )
    return cleaned


# Load the TSV files (kept under separate names so the raw data stays available)
train_raw = pd.read_csv(train_file_path, sep='\t')
test_raw = pd.read_csv(test_file_path, sep='\t')

# Display the first few rows of the data
print("Train DataFrame Head:")
display(train_raw.head())
print("Test DataFrame Head:")
display(test_raw.head())

# Inspect the data
print("Train DataFrame Info:")
train_raw.info()
print("Train DataFrame Description:")
display(train_raw.describe())

print("Test DataFrame Info:")
test_raw.info()
print("Test DataFrame Description:")
display(test_raw.describe())

# Handle missing values and duplicates (same rule for both splits)
train_df = clean_frame(train_raw, label_columns)
test_df = clean_frame(test_raw, label_columns)

# Save cleaned data
train_df.to_csv('cleaned_train.tsv', sep='\t', index=False)
test_df.to_csv('cleaned_test.tsv', sep='\t', index=False)


Train DataFrame Head:


Unnamed: 0,text,hd,cv,vo
0,He most likely converted to islam due to his n...,0,0,0
1,So Ford lied about being a psychologist. Recor...,0,0,0
2,Jobs. Education. Ending abuse of Nation. CA43.,0,0,0
3,"I share a lot of your values, & like many who ...",0,0,0
4,I am so ready to get back to blogging! www.ben...,0,0,0


Test DataFrame Head:


Unnamed: 0,text,hd,cv,vo
0,https://www.youtube.com/watch?v=kACWpKAKtak A ...,0,0,0
1,Very nice! I tend to get tired of the constant...,0,0,0
2,Watch today. https://circumcisionmovie.com/,0,0,0
3,""" Thinking Venues "" First Color Layer blocking...",0,0,0
4,What about death penalty for perpetrators and...,0,0,0


Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22036 entries, 0 to 22035
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    22036 non-null  object
 1   hd      22036 non-null  int64 
 2   cv      22036 non-null  int64 
 3   vo      22036 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 688.8+ KB
Train DataFrame Description:


Unnamed: 0,hd,cv,vo
count,22036.0,22036.0,22036.0
mean,0.084271,0.005945,0.062579
std,0.2778,0.076875,0.24221
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


Test DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5510 entries, 0 to 5509
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5510 non-null   object
 1   hd      5510 non-null   int64 
 2   cv      5510 non-null   int64 
 3   vo      5510 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 172.3+ KB
Test DataFrame Description:


Unnamed: 0,hd,cv,vo
count,5510.0,5510.0,5510.0
mean,0.089111,0.004356,0.066969
std,0.284929,0.06586,0.249991
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0
