In [1]:

# Install the desired scikit-learn version (1.3.0 in this example)
!pip install scikit-learn==1.3.0





[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder

In [22]:
# Load the dataset
data = pd.read_csv("Score Prediction Model Dataset.csv")



In [23]:
print(data.columns)


Index(['Job role', 'Programming and Software Development',
       'Data Science and Analytics', 'Database Management', 'Cloud Computing',
       'Project Management', 'Cybersecurity',
       'IT Infrastructure and Networking',
       'Artificial Intelligence and Machine Learning', 'System Administration',
       'User Experience and Design', 'Final Score'],
      dtype='object')


In [24]:
# Data Preprocessing
# Remove duplicate rows
#data.drop_duplicates(inplace=True)
# Remove duplicate rows based on the "Job Role" column
data.drop_duplicates(subset=["Job role"], keep="first", inplace=True)


# Fill null values with 0
data.fillna(0, inplace=True)

In [26]:
# Define X and y
X = data.drop(columns=["Final Score"])
y = data["Final Score"]

In [27]:
# Before the transformation
print("Unique Job Roles before encoding:", X['Job role'].nunique())

Unique Job Roles before encoding: 2108


In [28]:
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Job role'])],  # Replace 'Job Role' with your actual column name
    remainder='passthrough'
)

# Transform the X data including CountVectorizer encoding
X_encoded = column_transformer.fit_transform(X)

# After the transformation
print("X_encoded shape:", X_encoded.shape)  # This should match the number of rows in your dataset



X_encoded shape: (2108, 2118)


In [29]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [10]:
# Train the RandomForestRegressor model
rf_regressor = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),  # Add polynomial features
    ('scaler', StandardScaler(with_mean=False)),  # Standardize features without centering
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

rf_regressor.fit(X_train, y_train)

# Predict
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Mean Squared Error: 75.54
R-squared (R2) Score: -0.08


In [32]:
# Train the RandomForestRegressor model
linear_regressor = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),  # Add polynomial features
    ('scaler', StandardScaler()),  # Standardize features
    ('regressor', LinearRegression())
])



linear_regressor.fit(X_train, y_train)

# Predict
y_pred = linear_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\shanali de silva\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\shanali de silva\AppData\Local\Temp\ipykernel_5460\1648291733.py", line 10, in <module>
    linear_regressor.fit(X_train, y_train)
  File "c:\Users\shanali de silva\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
  File "c:\Users\shanali de silva\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Parameters
         ^^^^^^
  File "c:\Users\shanali de silva\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 370, in _fit
    # from the cache.
                      
  File "c:\Users\shanali de silva\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func

In [33]:
# Train the RandomForestRegressor model
svm_regressor = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),  # Add polynomial features
    ('scaler', StandardScaler()),  # Standardize features
    ('regressor', SVR(kernel='linear', C=1.0))
])


svm_regressor .fit(X_train, y_train)

# Predict
y_pred = svm_regressor .predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\shanali de silva\AppData\Roaming\Python\Python311\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\shanali de silva\AppData\Local\Temp\ipykernel_5460\3335371031.py", line 9, in <module>
    svm_regressor .fit(X_train, y_train)
  File "c:\Users\shanali de silva\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
  File "c:\Users\shanali de silva\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 416, in fit
    Parameters
         ^^^^^^
  File "c:\Users\shanali de silva\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 370, in _fit
    # from the cache.
                      
  File "c:\Users\shanali de silva\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\memory.py", line 353, in __call__
    return self.func(*a

In [13]:
# import pickle

# # Save the model to a file using pickle
# with open('random_forest_model.pkl', 'wb') as model_file:
#     pickle.dump(rf_regressor, model_file)


In [20]:
import joblib
joblib.dump(rf_regressor, 'RandomForestRegressor.joblib')


['RandomForestRegressor.joblib']

In [15]:
import sklearn
print(sklearn.__version__)


1.3.0
