In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler

import warnings
# Silence every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas / scikit-learn — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Read Dataset

In [None]:
# Load the test-score records from the Kaggle input directory and preview them
csv_path = '/kaggle/input/predict-test-scores-of-students/test_scores.csv'
df = pd.read_csv(csv_path)
df

## Cleaning and formatting data

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
# to spot missing data and non-numeric columns before modelling
df.info()

In [None]:
# Per-column flag: True when the column contains at least one missing value
df.isnull().any()

In [None]:
# Encode the two-level categorical columns as 0/1 integers
binary_encodings = {
    'school_type': {'Public': 1, 'Non-public': 0},
    'teaching_method': {'Standard': 1, 'Experimental': 0},
    'gender': {'Male': 1, 'Female': 0},
    'lunch': {'Does not qualify': 1, 'Qualifies for reduced/free lunch': 0},
}
for column, encoding in binary_encodings.items():
    df[column] = df[column].map(encoding)

# One-hot encode the multi-level categorical columns; each result is kept in
# its own frame, with the column name reused as the prefix
school, school_setting, classroom = (
    pd.get_dummies(df[name], prefix=name)
    for name in ('school', 'school_setting', 'classroom')
)

In [None]:
# Append the one-hot encoded columns, then discard the raw categorical
# columns they replace together with the student identifier
drop = ['school', 'school_setting', 'classroom', 'student_id']
df = pd.concat([df, school, school_setting, classroom], axis=1).drop(columns=drop)
df

## Visualization

In [None]:
# Histogram of the target column (posttest).
# The trailing semicolon belongs on the last statement of the cell: it keeps
# the Text object returned by plt.title out of the cell output.
df['posttest'].hist()
plt.title('posttest');

In [None]:
# Box plots of three columns, side by side, to look for outliers
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 4))
for column, axis in zip(('posttest', 'pretest', 'n_student'), axes):
    sns.boxplot(data=df, x=column, ax=axis)

## Train and evaluate a linear regression model with regularization

In [None]:
# Separate the target column from the predictors, then hold out 30% of the
# rows for validation (fixed seed so the split is reproducible)
y = df['posttest']
X = df.drop(columns='posttest')

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Standardise the features: the scaler is fitted on the training rows only
# and the same fitted scaler is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [None]:
# Fit a ridge regression with built-in cross-validation on the scaled training data
ridge_cv = RidgeCV().fit(X_train_scaled, y_train)
ridge_cv

In [None]:
# Generate predictions for the held-out validation features
# (the metrics themselves are computed in a later cell)
ridge_cv_predict = ridge_cv.predict(X_valid_scaled)


In [None]:
# Model assessment on the held-out validation split.
# `score` on a scikit-learn regressor returns the coefficient of determination (R^2).
print('Test data score:', ridge_cv.score(X_valid_scaled, y_valid))
# scikit-learn metrics expect (y_true, y_pred) in that order; MAE happens to be
# symmetric, but keeping the documented order avoids surprises if the metric changes.
print('MAE:', mean_absolute_error(y_valid, ridge_cv_predict))

In [None]:
# Fitted coefficients labelled by feature name, largest first
coefficients = pd.DataFrame({'coef': ridge_cv.coef_}, index=X_train.columns)
coefficients.sort_values('coef', ascending=False)