# Getting Started

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge
# Ridge regressor created with scikit-learn's default settings; it is fitted
# and evaluated in the Results section further down.
R = Ridge()

from sklearn.ensemble import RandomForestRegressor
# Random forest regressor. The seed is fixed (same value as the train/test
# split below) so the reported metrics are reproducible from run to run.
RF = RandomForestRegressor(random_state=28)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Load the student-mat.csv file into a DataFrame; every later cell works on `data`.
data = pd.read_csv("../input/student-alcohol-consumption/student-mat.csv")

In [None]:
# Display the freshly loaded frame (the notebook renders a cell's last expression).
data

# Data Visualization

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains text columns at this point (they are only encoded
# in the Preprocessing section), and pandas >= 2.0 no longer drops such
# columns silently inside DataFrame.corr() -- it raises instead. Selecting
# the numeric dtypes explicitly works on old and new pandas alike.
plt.figure(figsize=(14, 12))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)
plt.show()

In [None]:
# Bar chart of the final grade (G3) for each value of `sex`.
facet = sns.catplot(data=data, x='sex', y='G3', kind='bar')
facet.ax.set_title('Final Grade vs Gender', size=20)
facet.ax.grid()

In [None]:
# Bar chart of the final grade (G3) for each value of `age`.
facet = sns.catplot(data=data, x='age', y='G3', kind='bar')
facet.ax.set_title('Final Grade vs Age', size=20)
facet.ax.grid()

In [None]:
# Bar chart of the final grade (G3) for each value of `failures`.
facet = sns.catplot(data=data, x='failures', y='G3', kind='bar')
facet.ax.set_title('Final Grade vs Number of Classes Failed', size=20)
facet.ax.grid()

In [None]:
# Bar chart of the final grade (G3) for each value of `studytime`.
facet = sns.catplot(data=data, x='studytime', y='G3', kind='bar')
facet.ax.set_title('Final Grade vs Study Time', size=20)
facet.ax.grid()

In [None]:
# Bar chart of the final grade (G3) for each value of `internet`.
facet = sns.catplot(data=data, x='internet', y='G3', kind='bar')
facet.ax.set_title('Final Grade vs Internet', size=20)
facet.ax.grid()

In [None]:
# Bar chart of the final grade (G3) for each value of `freetime`.
facet = sns.catplot(data=data, x='freetime', y='G3', kind='bar')
facet.ax.set_title('Final Grade vs Freetime', size=20)
facet.ax.grid()

In [None]:
# Bar chart of the final grade (G3) for each value of `famrel`.
facet = sns.catplot(data=data, x='famrel', y='G3', kind='bar')
facet.ax.set_title('Final Grade vs Family Relationship', size=20)
facet.ax.grid()

In [None]:
# Bar chart of the final grade (G3) for each value of `Medu`.
facet = sns.catplot(data=data, x='Medu', y='G3', kind='bar')
facet.ax.set_title("Final Grade vs Mother's Education", size=20)
facet.ax.grid()

In [None]:
# Bar chart of the final grade (G3) for each value of `Fedu`.
facet = sns.catplot(data=data, x='Fedu', y='G3', kind='bar')
facet.ax.set_title("Final Grade vs Father's Education", size=20)
facet.ax.grid()

# Preprocessing

## Checking for Missing Values

In [None]:
# Count the missing entries in every column.
data.isna().sum()

## Encoding

In [None]:
# Show the dtype of every column to see which ones still need encoding.
data.dtypes

In [None]:
# Names of all columns whose dtype is `object`, in column order.
nonnumeric_columns = [name for name, dtype in data.dtypes.items() if dtype == 'object']
nonnumeric_columns

In [None]:
# List the distinct values of each object-dtype column, one column per line.
for column in nonnumeric_columns:
    print(f"{column}: {data[column].unique()}")

In [None]:
# Prepend a short column-specific tag to every value of these four columns.
# Presumably this keeps the dummy columns created below from colliding when
# two source columns share a category name -- verify against the data.
for column_name, tag in (('Mjob', "m_"), ('Fjob', "f_"), ('reason', "r_"), ('guardian', "g_")):
    data[column_name] = data[column_name].apply(lambda x, tag=tag: tag + x)

In [None]:
# Display the frame after the category values were prefixed.
data

In [None]:
# One-hot encode the four prefixed columns and place the resulting
# indicator columns side by side in a single frame.
dummies = pd.concat(
    [pd.get_dummies(data[name]) for name in ('Mjob', 'Fjob', 'reason', 'guardian')],
    axis=1,
)

In [None]:
# Display the indicator columns produced by get_dummies.
dummies

In [None]:
# Append the indicator columns to the frame and remove the four source
# columns they were derived from.
data = pd.concat([data, dummies], axis=1).drop(
    columns=['Mjob', 'Fjob', 'reason', 'guardian']
)

In [None]:
# Display the frame after the indicator columns replaced their source columns.
data

In [None]:
# Recompute the object-dtype column names now that four of them were
# replaced by indicator columns, then list what values remain in each.
nonnumeric_columns = [name for name, dtype in data.dtypes.items() if dtype == 'object']

for column in nonnumeric_columns:
    print(f"{column}: {data[column].unique()}")

In [None]:
# Replace every remaining object-dtype column with LabelEncoder codes.
# The same encoder instance is refit on each column, so once the loop ends it
# only holds the mapping of the last column processed.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# confirm this is the intended way to encode feature columns.
encoder = LabelEncoder()

for column in nonnumeric_columns:
    data[column] = encoder.fit_transform(data[column])

In [None]:
# Print the dtype of every column, one per line.
for dtype in data.dtypes:
    print(dtype)

In [None]:
# Features are the G1 and G2 columns; the target is the final grade G3.
# (G1/G2 are presumably the earlier period grades -- confirm against the
# dataset description.)
X = data.loc[:, ['G1', 'G2']]
y = data.loc[:, 'G3']

## Scaling

In [None]:
# Display the feature frame before scaling.
X

In [None]:
# Standardise the feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/test split performed in the Training section -- consider fitting it
# on the training rows only so no test-set statistics leak into the features.
scaler = StandardScaler()

# fit_transform returns a bare array; restore both the column labels and the
# row index so the scaled frame stays label-aligned with `y`.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Display the feature frame after scaling.
X

In [None]:
# Correlation heatmap over every column of the frame as it stands after the
# encoding cells; the wide figure makes room for the added indicator columns.
plt.figure(figsize=(50, 12))
correlations = data.corr()
sns.heatmap(data=correlations, annot=True)
plt.show()

# Training

In [None]:
# Split features and target: 60% of the rows go to the training set
# (train_size=0.6); random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, random_state=28)

In [None]:
# Fit a linear regression on the training split, then predict the test split.
model = LinearRegression().fit(X_train, y_train)
prediction = model.predict(X_test)

# Results

In [None]:
# Report the linear model's score on the held-out test split.
print(f"Model R2: {model.score(X_test, y_test)}")

In [None]:
# Test-set metrics for the current `prediction` (the linear regression
# output when the cells are run in order).
for label, metric in (("R2  :", r2_score),
                      ("MAE :", mean_absolute_error),
                      ("MSE :", mean_squared_error)):
    print(label, metric(y_test, prediction))

In [None]:
# Scatter the true test targets (x-axis) against the current `prediction`
# (y-axis) -- the linear regression output when the cells are run in order.
plt.scatter(y_test, prediction)

In [None]:
# Fit the Ridge model on the training split and overwrite `prediction`
# with its output for the test split.
prediction = R.fit(X_train, y_train).predict(X_test)

In [None]:
# Show the Ridge model's score on the held-out test split.
R.score(X_test,y_test)

In [None]:
# Test-set metrics for the current `prediction` (the Ridge output when the
# cells are run in order).
for label, metric in (("R2  :", r2_score),
                      ("MAE :", mean_absolute_error),
                      ("MSE :", mean_squared_error)):
    print(label, metric(y_test, prediction))

In [None]:
# Scatter the true test targets (x-axis) against the current `prediction`
# (y-axis) -- the Ridge output when the cells are run in order.
plt.scatter(y_test, prediction)

In [None]:
# Fit the random forest on the training split and overwrite `prediction`
# with its output for the test split.
prediction = RF.fit(X_train, y_train).predict(X_test)

In [None]:
# Test-set metrics for the current `prediction` (the random forest output
# when the cells are run in order).
for label, metric in (("R2  :", r2_score),
                      ("MAE :", mean_absolute_error),
                      ("MSE :", mean_squared_error)):
    print(label, metric(y_test, prediction))

In [None]:
# Scatter the true test targets (x-axis) against the current `prediction`
# (y-axis) -- the random forest output when the cells are run in order.
plt.scatter(y_test, prediction)