In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Read the competition's train and test CSV files into DataFrames
csv_paths = (
    '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv',
    '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv',
)
train_data, test_data = map(pd.read_csv, csv_paths)
# Preview the first rows of the training frame
train_data.head()

In [None]:
# Summary statistics from describe(), transposed so each described column is a row
train_data.describe().T

In [None]:
# Print pandas' summary of the training frame (DataFrame.info)
train_data.info()

In [None]:
# Distribution of the label column 'sii'.
# dropna=False adds a separate count for rows whose label is missing; by
# default value_counts() leaves NaN out of the tally, which would hide how
# many rows have no label at all.
train_data['sii'].value_counts(dropna=False)

In [None]:
# Keep only the columns in which fewer than half of the rows are missing
threshold = 0.5 * len(train_data)
columns_with_data = train_data.columns[train_data.isnull().sum() < threshold]
train_data = train_data[columns_with_data]
# Fill missing *feature* values with 0, but leave the label 'sii' untouched.
# Filling the label too would turn every unlabeled row into class 0 and make
# the later `dropna(subset=['sii'])` step a no-op. A dict passed to fillna()
# fills only the listed columns; columns not in the dict are left as they are.
fill_values = {column: 0 for column in train_data.columns if column != 'sii'}
train_data = train_data.fillna(fill_values)

In [None]:
# Define the target column
target_column = 'sii'
# Drop rows whose target is NaN. The explicit copy gives the cleaned frame its
# own data, so columns assigned on it later do not act on a frame that pandas
# may treat as a slice of `train_data` (chained-assignment warning).
train_data_cleaned = train_data.dropna(subset=[target_column]).copy()
# Check the results: print the summary first and leave head() as the final
# expression. A notebook cell only displays its last expression, so a head()
# call placed before info() would show nothing.
train_data_cleaned.info()
train_data_cleaned.head()

In [None]:
# Season-valued (categorical) columns to compare against the target
categorical_columns = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'FGC-Season',
    'BIA-Season',
    'PCIAT-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]
# One boxplot of 'sii' per categorical column, laid out on a 4-row x 2-column grid
fig = plt.figure(figsize=(16, 24))
for i, col in enumerate(categorical_columns, start=1):
    ax = fig.add_subplot(4, 2, i)
    sns.boxplot(data=train_data_cleaned, x=col, y='sii', ax=ax)
    # Rotate the x tick labels of this subplot
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.set_title(f"'sii' vs {col}")
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of every float64/int64 column, grouped by the target column 'sii'
numerical_cols = train_data_cleaned.select_dtypes(include=['float64', 'int64']).columns
# Grid geometry: a fixed number of plots per row, with the row count rounded up
plots_per_row = 5
n_rows = (len(numerical_cols) + plots_per_row - 1) // plots_per_row
plt.figure(figsize=(20, 4 * n_rows))
for i, col in enumerate(numerical_cols):
    plt.subplot(n_rows, plots_per_row, i + 1)
    sns.boxplot(x='sii', y=col, data=train_data_cleaned)
    plt.title(col)
# Compute the layout once, after every subplot exists. Calling tight_layout()
# inside the loop redid the whole-figure layout for each added subplot; only
# the final call determines the displayed result.
plt.tight_layout()
plt.show()

In [None]:
# Season-valued categorical columns to encode as integers
season_cols = [
    'Basic_Demos-Enroll_Season', 
    'CGAS-Season', 
    'Physical-Season', 
    'FGC-Season', 
    'BIA-Season', 
    'PCIAT-Season', 
    'SDS-Season', 
    'PreInt_EduHx-Season'
]
# Integer code assigned to each season label
season_mapping = {
    'Spring': 0,
    'Summer': 1,
    'Fall': 2,
    'Winter': 3
}
# NOTE(review): missing values were filled with 0 earlier in the notebook and
# 'Spring' is also encoded as 0, so a missing season and Spring look identical
# after this step — confirm that this is intended.
# Rebind to an owned copy so the column assignments below never target a frame
# that pandas may treat as a slice of another frame.
train_data_cleaned = train_data_cleaned.copy()
# Encode each season column that is present in the frame
for col in season_cols:
    if col in train_data_cleaned.columns:
        # Element-wise lookup: season labels become their integer code and any
        # other value passes through unchanged. This avoids relying on
        # `replace` to downcast an object column to integers, which recent
        # pandas versions deprecate.
        train_data_cleaned[col] = train_data_cleaned[col].map(
            lambda value: season_mapping.get(value, value)
        )

In [None]:
# Work on a frame without the 'id' column (ignored if the column is absent)
train_data_no_id = train_data_cleaned.drop('id', axis='columns', errors='ignore')
# Pairwise correlations between the remaining columns
correlation_matrix = train_data_no_id.corr()
# Render the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()