## ML Modeling (OLS & LR)

In [1]:
from preprocessing_utils import preprocessing
from ModularML_new import train_model
from ModularML_new import get_predictions
from ModularML_new import save_model_to_pickle


## Creating a dataframe of 1000 records

In [2]:
import pandas as pd
import numpy as np
# Build a reproducible synthetic employee-training dataset.
np.random.seed(42)  # fixed seed: every run draws the same rows
num_records = 1000

# (column, low, high) for each integer feature; `high` is exclusive in
# np.random.randint. The columns are drawn in exactly this order, which must
# not change or the seeded values shift between columns.
# NOTE(review): completed_trainings is sampled independently of
# total_trainings, so completed can exceed total for some rows.
integer_feature_bounds = (
    ('age', 22, 60),
    ('experience', 1, 20),
    ('completed_trainings', 0, 10),
    ('expired_trainings', 0, 5),
    ('total_trainings', 5, 15),
)

data = {
    column: np.random.randint(low, high, size=num_records)
    for column, low, high in integer_feature_bounds
}
# The categorical column is drawn last, after all integer columns.
data['gender'] = np.random.choice(['Male', 'Female'], size=num_records)

df = pd.DataFrame(data)


In [3]:
# Peek at the first five generated rows.
df.head(5)

Unnamed: 0,age,experience,completed_trainings,expired_trainings,total_trainings,gender
0,50,17,6,4,5,Male
1,36,13,3,3,9,Female
2,29,16,8,4,11,Female
3,42,8,4,0,12,Male
4,40,19,2,4,14,Female


### Calculating training completion rate

In [4]:
# Share of assigned trainings each employee has completed; this becomes the regression target.
df["completion_rate"] = df["completed_trainings"].div(df["total_trainings"])

In [5]:
# Check column types: `gender` is the only non-numeric (object) column.
df.dtypes

age                      int64
experience               int64
completed_trainings      int64
expired_trainings        int64
total_trainings          int64
gender                  object
completion_rate        float64
dtype: object

In [6]:
# NOTE(review): a bare `df` renders the whole frame (pandas truncates the middle rows);
# `df.head()` would be enough to confirm the new completion_rate column.
df

Unnamed: 0,age,experience,completed_trainings,expired_trainings,total_trainings,gender,completion_rate
0,50,17,6,4,5,Male,1.20
1,36,13,3,3,9,Female,0.33
2,29,16,8,4,11,Female,0.73
3,42,8,4,0,12,Male,0.33
4,40,19,2,4,14,Female,0.14
...,...,...,...,...,...,...,...
995,34,3,7,4,6,Female,1.17
996,51,10,9,0,14,Male,0.64
997,44,14,2,1,13,Female,0.15
998,40,6,4,4,6,Male,0.67


## Data preprocessing and OLS model training

In [7]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import statsmodels.api as sm



# Features are every column except the target; `completion_rate` is what we predict.
# NOTE(review): completion_rate was computed as completed_trainings / total_trainings
# and both of those columns remain in X, so the target is a deterministic function of
# the features — confirm this leakage is intended for the demo.
X = df.drop(columns=['completion_rate'])  # feature df
y = df['completion_rate']  # target

# One-hot encode the only categorical column. With handle_unknown='ignore', a category
# not seen during fit is encoded as all zeros instead of raising.
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['gender']))

# Hold out 30% of the rows for evaluation (seeded so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the encoder on the training split only, so nothing is learned from the test rows.
# fit() returns the transformer itself, so X_train_pre is just an alias of the fitted
# `transformer`; the name is kept only for backward compatibility.
X_train_pre = transformer.fit(X_train)

# `preprocessing` is a project helper (preprocessing_utils); presumably it applies the
# fitted transformer and returns the encoded DataFrame — confirm against the helper.
X_train = preprocessing(X_train, transformer)

# statsmodels' OLS does not add an intercept by itself. has_constant='add' matches how
# the test matrix is built later in the notebook, so both matrices always carry `const`.
# NOTE(review): `const` plus one dummy per gender category is perfectly collinear
# (dummy-variable trap). The fit still succeeds, but the individual coefficients of
# those columns are not uniquely identified — consider dropping one dummy level.
X_train_OLS = sm.add_constant(X_train, has_constant='add')

# Fit the OLS model
model_OLS = sm.OLS(y_train, X_train_OLS).fit()


In [9]:
# Performing preprocessing on test data with the transformer fitted on the training split.
# After preprocessing, the raw `gender` column is replaced by its one-hot columns, so the
# guard makes this cell safe to re-run: without it a second run would pass an
# already-encoded frame (no `gender` column left for the transformer) back to `preprocessing`.
if 'gender' in X_test.columns:
    X_test = preprocessing(X_test, transformer)

In [10]:
# Inspect the OLS design matrix: the `const` intercept column comes first, followed by the encoded features.
X_train_OLS

Unnamed: 0,const,age,experience,completed_trainings,expired_trainings,total_trainings,Gender_Female,Gender_Male
541,1.00,57,18,7,0,11,0.00,1.00
440,1.00,58,12,9,1,14,1.00,0.00
482,1.00,37,11,1,3,13,1.00,0.00
422,1.00,44,11,7,2,9,1.00,0.00
778,1.00,52,8,0,1,14,1.00,0.00
...,...,...,...,...,...,...,...,...
106,1.00,26,5,3,2,10,0.00,1.00
270,1.00,40,11,1,0,12,1.00,0.00
860,1.00,47,18,6,0,5,0.00,1.00
435,1.00,57,15,3,1,8,0.00,1.00


In [11]:
# Inspect the encoded test features; there is no `const` column yet — it is added in the next cell.
X_test

Unnamed: 0,age,experience,completed_trainings,expired_trainings,total_trainings,Gender_Female,Gender_Male
521,24,14,8,3,7,1.00,0.00
737,36,9,6,0,12,1.00,0.00
740,49,2,8,2,6,0.00,1.00
660,45,17,5,2,11,0.00,1.00
411,33,12,6,2,12,0.00,1.00
...,...,...,...,...,...,...,...
468,30,12,6,4,14,1.00,0.00
935,54,2,7,1,7,1.00,0.00
428,31,3,3,1,14,1.00,0.00
7,32,14,2,2,9,1.00,0.00


In [12]:
# Add the intercept column the OLS model expects. has_constant='add' forces a `const`
# column even if some test column happens to be constant (the default 'skip' would then
# leave it out, and the columns would no longer line up with the training design matrix).
X_test_prep = sm.add_constant(X_test, has_constant='add')

In [13]:
# Make predictions on the held-out rows with the fitted OLS model.
# get_predictions is a project helper (ModularML_new); presumably it wraps
# model.predict — confirm against the helper.
y_pred = get_predictions(model_OLS, X_test_prep)


In [14]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the OLS predictions (y_pred) against the held-out targets (y_test).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics: error first, then goodness of fit.
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Squared Error: 0.011024744416268106
R-squared: 0.9090569026580341


## Linear Regression

In [15]:
# Prediction using Linear Regression
from sklearn.linear_model import LinearRegression

# X_train here is the encoded frame without the `const` column (that column exists only
# in X_train_OLS).
# train_model is a project helper (ModularML_new); presumably it fits the estimator on
# (X_train, y_train) and returns it — confirm against the helper.
regression_model = LinearRegression()
model_lr = train_model(X_train, y_train, regression_model)

In [16]:
# Predict on the encoded test features; unlike the OLS path, no `const` column is passed here.
y_pred_lr = get_predictions(model_lr, X_test)

In [17]:
# Score the linear-regression predictions on the held-out split
# (mean_squared_error / r2_score were imported in the OLS evaluation cell).
r2_linear_reg = r2_score(y_test, y_pred_lr)
mse_linear_reg = mean_squared_error(y_test, y_pred_lr)

# Same reporting order as the OLS cell: error first, then R-squared.
print(f'Linear Regression MSE: {mse_linear_reg}')
print(f'Linear Regression R-squared: {r2_linear_reg}')


Linear Regression MSE: 0.011024744416268195
Linear Regression R-squared: 0.9090569026580333


In [18]:
# Saving pickle files:
# Persist the fitted encoder so the same gender encoding can be reapplied later.
# save_model_to_pickle is a project helper (ModularML_new); presumably it pickles the
# object to the given path — confirm against the helper.

save_model_to_pickle(transformer, 'encoding.pkl')

In [19]:
# Persist the trained linear-regression model.
save_model_to_pickle(model_lr, 'lr_Model.pkl')

In [20]:
# Persist the fitted statsmodels OLS results object.
save_model_to_pickle(model_OLS, 'OLS_Model.pkl')

In [21]:
# # Save the function and the fitted transformer
# with open('encoding.pkl', 'wb') as file:
#     pickle.dump(transformer, file)

In [22]:
# import pickle

# # Save the trained model
# with open('lr_Model.pkl', 'wb') as file:
#     pickle.dump(model_lr, file)

In [23]:
# import pickle

# # Save the trained model
# with open('OLS_Model.pkl', 'wb') as file:
#     pickle.dump(model_OLS, file)


In [24]:
# Trigger browser downloads of the saved artifacts. google.colab is only importable
# inside a Google Colab runtime, so this cell is Colab-specific.
from google.colab import files

for artifact_name in ('OLS_Model.pkl', 'encoding.pkl'):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
# Download the pickled linear-regression model (uses `files` imported from google.colab in the previous cell).
files.download('lr_Model.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>