In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle



In [8]:
# Load the data (assuming you're extracting it from the database or using CSV for demo purposes)
data = pd.read_csv('student_data.csv')

# Sample Data Columns: ['id', 'name', 'batch', 'gpa', 'core_courses_score', 'hackathon_participation', 'papers_published', 'teaching_assistance']

# Features (X) and Target (y)
X = data[['student_id','name','batch','gpa', 'core_course_score', 'hackathon_participation', 'paper_presentations', 'teacher_assistance','other_contributions']]
y = data['overall_score']  # This will be our target variable which we will try to predict

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Check the data types of your columns
print(X_train.dtypes)

# If 'name' or other non-numeric columns are included, drop them
X_train = X_train.select_dtypes(include=['float64', 'int64'])
X_test = X_test.select_dtypes(include=['float64', 'int64'])


student_id                   int64
name                        object
batch                        int64
gpa                        float64
core_course_score          float64
hackathon_participation      int64
paper_presentations          int64
teacher_assistance          object
other_contributions         object
dtype: object


In [11]:
# Example: Convert categorical values to numerical (assuming binary Yes/No columns)
X_train['hackathon_participation'] = X_train['hackathon_participation'].apply(lambda x: 1 if x == 'Yes' else 0)
X_test['hackathon_participation'] = X_test['hackathon_participation'].apply(lambda x: 1 if x == 'Yes' else 0)

# You can do similar transformations for other categorical features


In [12]:
# Check for missing values and handle them
print(X_train.isnull().sum())
X_train.fillna(0, inplace=True)  # You can also choose a different strategy like mean or median
X_test.fillna(0, inplace=True)


student_id                 0
batch                      0
gpa                        0
core_course_score          0
hackathon_participation    0
paper_presentations        0
dtype: int64


In [13]:
# Now fit the model again
model = LinearRegression()
model.fit(X_train, y_train)

# Test the model
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 1.6564628570747175


In [14]:
# Save the model to a file for later use
with open('student_ranking_model.pkl', 'wb') as f:
    pickle.dump(model, f)
