## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [18]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
# Modelling
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import warnings

#### Import the CSV Data as Pandas DataFrame

In [19]:
df = pd.read_csv('stud.csv')

In [20]:
categorical_data = [i for i in df.columns if df[i].dtype=="object"]
categorical_data

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [21]:
numerical_data = [i for i in df.columns if df[i].dtype!="object"]
numerical_data

['math_score', 'reading_score', 'writing_score']

In [22]:
continuous_numerical_data = [i for i in numerical_data if len(df[i].unique())>=16]
continuous_numerical_data

['math_score', 'reading_score', 'writing_score']

In [23]:
# We will encode the categorical data using label Encoder.
le = preprocessing.LabelEncoder()

for i in categorical_data:
    df[i] = le.fit_transform(df[i])

In [24]:
# Log Transformation
for i in ["reading_score", "writing_score"]:
    df[i] = np.log(df[i])

#### Preparing X and Y variables

In [25]:
X = df.drop(columns = "math_score")
y = df["math_score"]

In [26]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
X_train.shape, X_test.shape

((700, 7), (300, 7))

#### Create an Evaluate Function to give all metrics after model Training

In [27]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [28]:
models = {
    "Random Forest Regressor": RandomForestRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print()
    print('----------------------------------')
    print()
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 2.2929
- Mean Absolute Error: 1.8282
- R2 Score: 0.9762

----------------------------------

Model performance for Test set
- Root Mean Squared Error: 6.1908
- Mean Absolute Error: 4.8760
- R2 Score: 0.8460




### Results

In [29]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Random Forest Regressor,0.845961


## Random Forest Regressor

In [30]:
# Import the RandomForestRegressor class from the ensemble module
from sklearn.ensemble import RandomForestRegressor

# Instantiate the RandomForestRegressor with default hyperparameters
model = RandomForestRegressor(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred_rf = model.predict(X_test)

# Calculate R-squared score
score_rf = r2_score(y_test, y_pred_rf) * 100

print("Accuracy of the Random Forest model is %.2f" % score_rf)


Accuracy of the Random Forest model is 84.44


In [31]:
score = cross_val_score(model, X, y, cv=5)
print("Cross validation is",np.mean(score)*100)

Cross validation is 83.62571898492392


In [32]:
# Create a dictionary with values for each feature
new_data = {
    'gender': 1,  # Example: 1 for male, 0 for female
    'race_ethnicity': 2,  # Example: 0 for group A, 1 for group B, and so on
    'parental_level_of_education': 3,  # Example: 0 for bachelor's degree, 1 for some college, and so on
    'lunch': 1,  # Example: 1 for standard, 0 for free/reduced
    'test_preparation_course': 0,  # Example: 1 for completed, 0 for none
    'reading_score': 85,  # Example: 85
    'writing_score': 90  # Example: 90
}

# Convert the dictionary to a DataFrame
new_df = pd.DataFrame([new_data])

# Make prediction using the trained Random Forest model
predicted_math_score = model.predict(new_df)

print("Predicted math score:", predicted_math_score[0])


Predicted math score: 94.04309523809523


In [33]:
import pickle
# open a file, where you want to store the data
file = open('random-forest.pkl', 'wb')

# dump information to that file
pickle.dump(model, file)

In [34]:
# import joblib

# le = {}
# for col in ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']:
#     le[col] = preprocessing.LabelEncoder()
#     df[col] = le[col].fit_transform(df[col])
    
# joblib.dump(le, 'label_encoders.joblib')


In [35]:
# import pandas as pd
# import numpy as np
# from sklearn import preprocessing
# from sklearn.ensemble import RandomForestRegressor
# import joblib
# import pickle

# # Load your dataset
# df = pd.read_csv('stud.csv')

# # Encode categorical data
# le = {}
# categorical_data = ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
# for col in categorical_data:
#     le[col] = preprocessing.LabelEncoder()
#     df[col] = le[col].fit_transform(df[col])

# # Save the label encoders
# joblib.dump(le, 'label_encoders.joblib')

# # Log Transformation
# for col in ["reading_score", "writing_score"]:
#     df[col] = np.log(df[col])

# # Prepare X and Y variables
# X = df.drop(columns="math_score")
# y = df["math_score"]

# # Train the RandomForestRegressor model
# model = RandomForestRegressor(random_state=42)
# model.fit(X_train, y_train)

# # Save the trained model
# with open('model.pkl', 'wb') as model_file:
#     pickle.dump(model, model_file)
