# 1. Cleaning and preparing data:
   a. Clean the data of errors and inconsistencies.
   b. Deal with missing values and outliers.
   c. Transform the data into a format suitable for analysis and processing.


In [None]:
import pandas as pd
import numpy as np

def _clean_reviews(frame):
    """Return a cleaned copy of one reviews DataFrame.

    Steps:
      1. Parse 'reviews.date' to datetime; unparseable or missing entries
         become NaT (errors='coerce').
      2. Fill missing values with the "Unknown" placeholder in non-numeric,
         non-datetime columns only.

    Numeric columns are deliberately left alone: writing a string placeholder
    into them would turn the whole column into object dtype and break any
    later numeric analysis. Their missing values stay NaN. Note that a column
    that is entirely empty is read as float and is therefore also left as NaN.
    """
    cleaned = frame.copy()

    # Parse dates before filling, so the date column never holds the string
    # placeholder (it would only be coerced back to NaT anyway).
    cleaned['reviews.date'] = pd.to_datetime(cleaned['reviews.date'], errors='coerce')

    # Everything that is neither numeric nor a datetime is treated as text.
    # 'reviews.date' is excluded by name as well, in case parsing left it
    # with a non-datetime dtype.
    text_columns = cleaned.select_dtypes(
        exclude=['number', 'datetime', 'datetimetz']
    ).columns.difference(['reviews.date'])

    return cleaned.fillna(value=dict.fromkeys(text_columns, "Unknown"))


train_data = pd.read_csv('/mnt/data/train_data.csv')
test_data = pd.read_csv('/mnt/data/test_data.csv')
test_data_hidden = pd.read_csv('/mnt/data/test_data_hidden.csv')

# quick look at the raw data before any cleaning
print(train_data.head())
print(test_data.head())
print(test_data_hidden.head())

# parse dates and fill missing text values with a placeholder
train_data = _clean_reviews(train_data)
test_data = _clean_reviews(test_data)
test_data_hidden = _clean_reviews(test_data_hidden)

# checking for data type inconsistencies
# (DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print())
train_data.info()
test_data.info()
test_data_hidden.info()

# saving the cleaned data
train_data.to_csv('/mnt/data/train_data_cleaned.csv', index=False)
test_data.to_csv('/mnt/data/test_data_cleaned.csv', index=False)
test_data_hidden.to_csv('/mnt/data/test_data_hidden_cleaned.csv', index=False)


In [None]:
# 2. Conducting exploratory data analysis (EDA)
a. Explore the data to understand its distribution, the relationships between variables, and
note any outliers.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Class balance: number of training reviews per sentiment label ---
sentiment_fig, sentiment_ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_data, x='sentiment', ax=sentiment_ax)
sentiment_ax.set_title('Sentiment Distribution in Train Data')
plt.show()

# --- Review length: character count of each review text ---
# The length is stored as a new 'review_length' column on train_data.
train_data['review_length'] = train_data['reviews.text'].map(len)
length_fig, length_ax = plt.subplots(figsize=(8, 6))
sns.histplot(train_data['review_length'], bins=50, ax=length_ax)
length_ax.set_title('Review Text Length Distribution')
plt.show()

# --- Review dates: count of reviews per distinct 'reviews.date' value, ---
# --- drawn in chronological order                                      ---
reviews_per_date = train_data['reviews.date'].value_counts().sort_index()
dates_fig, dates_ax = plt.subplots(figsize=(10, 6))
reviews_per_date.plot(ax=dates_ax)
dates_ax.set_title('Review Dates Distribution')
dates_ax.set_xlabel('Date')
dates_ax.set_ylabel('Number of Reviews')
plt.show()