In [25]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px

In [1]:
import pandas as pd

# Location of the sample submission file
# NOTE(review): absolute, machine-specific path — consider a configurable data directory
file_path = r"C:\Users\iamus\Downloads\sample_submission.csv"

# Read the CSV into a DataFrame and preview its first five rows
df = pd.read_csv(file_path)
print(df.head(5))


       id  exam_score
0  630000           0
1  630001           0
2  630002           0
3  630003           0
4  630004           0


In [2]:
# Structural summary of the dataset (columns, dtypes, non-null counts, memory).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line after the report.
df.info()

# Summary statistics for the numeric columns
print(df.describe())

# Number of missing values in each column
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270000 entries, 0 to 269999
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   id          270000 non-null  int64
 1   exam_score  270000 non-null  int64
dtypes: int64(2)
memory usage: 4.1 MB
None
                  id  exam_score
count  270000.000000    270000.0
mean   764999.500000         0.0
std     77942.430678         0.0
min    630000.000000         0.0
25%    697499.750000         0.0
50%    764999.500000         0.0
75%    832499.250000         0.0
max    899999.000000         0.0
id            0
exam_score    0
dtype: int64


In [4]:
# Read the training set first, then the test set
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\iamus\Downloads\train.csv",
        r"C:\Users\iamus\Downloads\test.csv",
    )
)

# Preview the first five rows of each frame, one after the other
print(train.head(), test.head(), sep="\n")


   id  age  gender   course  study_hours  class_attendance internet_access  \
0   0   21  female     b.sc         7.91              98.8              no   
1   1   18   other  diploma         4.95              94.8             yes   
2   2   20  female     b.sc         4.68              92.6             yes   
3   3   19    male     b.sc         2.00              49.5             yes   
4   4   23    male      bca         7.65              86.9             yes   

   sleep_hours sleep_quality   study_method facility_rating exam_difficulty  \
0          4.9       average  online videos             low            easy   
1          4.7          poor     self-study          medium        moderate   
2          5.8          poor       coaching            high        moderate   
3          8.3       average    group study            high        moderate   
4          9.6          good     self-study            high            easy   

   exam_score  
0        78.3  
1        46.7  
2       

# ⭐: Features (X) and target (y)

In [6]:
# Target: the column to be predicted
y = train['exam_score']

# Features: every column of the training frame except the target
X = train.drop(columns='exam_score')


# ⭐: Combine Train & Test (for Encoding)

In [9]:
# Stack the training features on top of the test rows (row-wise) ahead of encoding
full_data = pd.concat([X, test])
print(full_data.head(n=10))

   id  age  gender   course  study_hours  class_attendance internet_access  \
0   0   21  female     b.sc         7.91              98.8              no   
1   1   18   other  diploma         4.95              94.8             yes   
2   2   20  female     b.sc         4.68              92.6             yes   
3   3   19    male     b.sc         2.00              49.5             yes   
4   4   23    male      bca         7.65              86.9             yes   
5   5   24    male    b.com         5.04              85.1             yes   
6   6   20    male     b.sc         4.28              87.0              no   
7   7   22  female       ba         4.19              44.9             yes   
8   8   22   other    b.com         1.06              98.3             yes   
9   9   18    male      bba         3.44              80.9             yes   

   sleep_hours sleep_quality   study_method facility_rating exam_difficulty  
0          4.9       average  online videos             low    

# ⭐: Identify Numerical & Categorical Columns

In [11]:
# Names of the columns stored as 64-bit integers or floats
numerical_cols = full_data.select_dtypes(['int64', 'float64']).columns

# Names of the columns stored as object, category or bool
categorical_cols = full_data.select_dtypes(['object', 'category', 'bool']).columns

# Report both groups, each under its own heading line
print("üìä Numerical Columns:", numerical_cols, sep="\n")
print("\nüè∑Ô∏è Categorical Columns:", categorical_cols, sep="\n")


üìä Numerical Columns:
Index(['id', 'age', 'study_hours', 'class_attendance', 'sleep_hours'], dtype='object')

üè∑Ô∏è Categorical Columns:
Index(['gender', 'course', 'internet_access', 'sleep_quality', 'study_method',
       'facility_rating', 'exam_difficulty'],
      dtype='object')


# ⭐: Drop id

In [12]:
# Target vector
y = train['exam_score']

# Training features: the training frame without the target and the 'id' column
X = train.drop(columns=['exam_score', 'id'])

# Test features: the test frame without the 'id' column
X_test = test.drop(columns=['id'])


# ⭐: Convert Categorical Columns (One-Hot Encoding)

In [13]:
# Stack train and test features row-wise and one-hot encode them in a single
# pass, so both parts end up with the same set of dummy columns
full_data = pd.get_dummies(pd.concat([X, X_test]))


# ⭐: Split Back

In [14]:
# Undo the stacking by position: the first len(train) rows are the training
# features, everything after them is the test features
X_test = full_data.iloc[len(train):]
X = full_data.iloc[:len(train)]


# ⭐: Train a Strong Model (Random Forest)

In [17]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible between runs
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [18]:
# Random forest used for the validation run
model = RandomForestRegressor(
    n_estimators=200,     # 200 trees in the forest
    random_state=42,      # fixed seed
    n_jobs=-1,            # use every available CPU core
    max_depth=None,       # trees grow without a depth limit
    min_samples_leaf=1,   # a leaf may hold a single sample
    min_samples_split=2,  # a node needs at least two samples to be split
)

In [19]:
# Fit the forest on the training portion of the split
model.fit(X=X_train_split, y=y_train_split)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# Predict the target for the held-out validation rows
val_preds = model.predict(X=X_val)

In [21]:
# RMSE is computed by hand as sqrt(MSE), which also works on older
# scikit-learn versions
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=val_preds))
print("Validation RMSE:", rmse)

Validation RMSE: 9.0678363997416


In [26]:
# Pair every feature name with the importance reported by the fitted forest,
# ordered from the most to the least important
importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': model.feature_importances_}
).sort_values('Importance', ascending=False)

# Horizontal, interactive Plotly bar chart with the score written on every bar
fig = px.bar(
    importance,
    y='Feature',
    x='Importance',
    text='Importance',
    orientation='h',
    height=600,
    title='Feature Importance',
    labels={'Importance': 'Importance Score', 'Feature': 'Features'},
)

# Reversing the y-axis puts the first (largest) row at the top of the chart
fig.update_layout(
    template='plotly_white',
    yaxis={'autorange': 'reversed'},
    title_font={'size': 22},
    xaxis_title_font={'size': 18},
    yaxis_title_font={'size': 18},
)

fig.show()


# ⭐: Predict & Submission

In [23]:
# Score the test features with the model fitted on the training split
test_preds = model.predict(X=X_test)

# Two-column submission frame: the test ids next to their predicted scores
submission = pd.DataFrame({'id': test['id'], 'exam_score': test_preds})

# Write the file without the DataFrame index
submission.to_csv("submission.csv", index=False)
print("Submission file ready!")


Submission file ready!


# ⭐: Train Final Model on Full Train Data

In [24]:
# Final forest: 300 trees, fixed seed, all CPU cores
final_model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

# Fit on every training row (no validation hold-out this time)
final_model.fit(X=X, y=y)


0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [27]:
# Score the test features with the model fitted on the full training data
test_preds = final_model.predict(X=X_test)


In [28]:
# Two-column submission frame: the test ids next to the final predictions
submission = pd.DataFrame({'id': test['id'], 'exam_score': test_preds})

# Write the file without the DataFrame index
submission.to_csv("final_submission.csv", index=False)
print("final_submission.csv is ready to upload to Kaggle!")


final_submission.csv is ready to upload to Kaggle!
