# Imbalance Data

![image-15.png](attachment:image-15.png)

# Load Dataset

In [2]:
import pandas as pd 
df = pd.read_csv("student-scores.csv")
df

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,Alan,Reynolds,alan.reynolds.1996@gslingacademy.com,male,False,2,False,30,Construction Engineer,83,77,84,73,75,84,82
1996,1997,Thomas,Gilbert,thomas.gilbert.1997@gslingacademy.com,male,False,2,False,20,Software Engineer,89,65,73,80,87,67,73
1997,1998,Madison,Cross,madison.cross.1998@gslingacademy.com,female,False,5,False,14,Software Engineer,97,85,63,93,68,94,78
1998,1999,Brittany,Compton,brittany.compton.1999@gslingacademy.com,female,True,10,True,5,Business Owner,51,96,72,89,95,88,75


# 1 Data Preprocessing Step

# Encoding Categorical Column

In [4]:
from sklearn.preprocessing import OrdinalEncoder

# Select categorical columns to encode
categorical_cols = ['fuel', 'seller_type', 'transmission','owner']

# Create OrdinalEncoder instance
encoder = OrdinalEncoder()

# Apply encoding
df[categorical_cols] = encoder.fit_transform(df[categorical_cols]).astype(int)

df

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,4,1,1,0
1,2007,135000,50000,4,1,1,0
2,2012,600000,100000,1,1,1,0
3,2017,250000,46000,4,1,1,0
4,2014,450000,141000,1,1,1,2
...,...,...,...,...,...,...,...
4335,2014,409999,80000,1,1,1,2
4336,2014,409999,80000,1,1,1,2
4337,2009,110000,83000,4,1,1,2
4338,2016,865000,90000,1,1,1,0


# Scalling: Standarizatoin (standardscaler)

In [6]:
from sklearn.preprocessing import StandardScaler


# Select numerical columns to normalize
numerical_cols = ['year', 'km_driven']

# Initialize scaler
scaler = StandardScaler()

# Fit-transform on full data (only on numerical columns)
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,-1.445074,60000,0.081139,4,1,1,0
1,-1.445074,135000,-0.347689,4,1,1,0
2,-0.258795,600000,0.724381,1,1,1,0
3,0.927485,250000,-0.433455,4,1,1,0
4,0.215717,450000,1.603479,1,1,1,2
...,...,...,...,...,...,...,...
4335,0.215717,409999,0.295553,1,1,1,2
4336,0.215717,409999,0.295553,1,1,1,2
4337,-0.970563,110000,0.359877,4,1,1,2
4338,0.690229,865000,0.509967,1,1,1,0


# Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

X = df.drop('selling_price', axis=1)  # Features
y = df['selling_price']              # Target

# Train-Test Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check shape
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (3472, 6)
Test shape: (868, 6)


# Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# R² Score
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.2f}")


R² Score: 0.39


# RandomForest 

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Initialize the model
model = RandomForestRegressor(random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# R² Score
r2 = r2_score(y_test, y_pred)
print(f"R² Score (Random Forest Regressor): {r2:.2f}")


R² Score (Random Forest Regressor): 0.50
