In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the students' exam results from the Kaggle input directory into a DataFrame.
students_csv_path = "../input/students-performance-in-exams/StudentsPerformance.csv"
data = pd.read_csv(filepath_or_buffer=students_csv_path)

In [None]:
#Необходимо предсказать оценку по математике
#Let's predict the math score

In [None]:
# Drop the other two exam scores so that 'math score' is the only score column left.
# Re-binding the name (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError because the columns
# have already been removed.
data = data.drop(columns=['reading score', 'writing score'], errors='ignore')

In [None]:
# Preview the first five rows of the frame.
data.head(n=5)

In [None]:
# Feature visualisation: number of students per gender.
sns.countplot(data=data, x="gender")

In [None]:
# Number of students per race/ethnicity group.
sns.countplot(data=data, x="race/ethnicity")

In [None]:
# Share (in %) of every parental education level, in order of first appearance.
education = data['parental level of education']
labels = education.unique()
# The mean of the boolean mask is the fraction of rows carrying that label.
# The previous `DataFrame.count()[1]` used an integer key as a *position* inside a
# label-indexed Series, which pandas has deprecated, and it counted the non-null
# values of an unrelated column rather than the rows themselves.
percentages = [round(float((education == label).mean() * 100), 2) for label in labels]
print(percentages)
print(sum(percentages))

In [None]:
# Horizontal bar chart: one bar per education level, bar length = share in percent.
ax = sns.barplot(y=labels, x=percentages)
ax.set_xlabel("Percentage")
ax.set_title(label="Parental level of education in %", fontsize=15)

In [None]:
# Number of students per lunch type.
sns.countplot(data=data, x='lunch')

In [None]:
# Number of students per test-preparation-course status.
sns.countplot(data=data, x='test preparation course')

In [None]:
# Histogram of the target variable.
math_score = data.loc[:, 'math score']
ax = math_score.hist()
ax.set_xlabel('Math score')
ax.set_ylabel('count')

In [None]:
# Box plot of the math score, one box per gender.
ax = data.boxplot(by='gender', column='math score')
ax.set_ylabel('Math score')

In [None]:
# Number of missing values in every column (isna is the same method as isnull).
data.isna().sum()

In [None]:
# Box plot of the math score, one box per race/ethnicity group.
ax = data.boxplot(by='race/ethnicity', column='math score')
ax.set_ylabel('Math score')

In [None]:
'''
Видно, что у мужчин балл по математике в среднем выше
однако у женщин есть выбросы, у кого-то вообще 0 баллов.
Возможно, это аннулированная за списывание. В любом случае, такие
выбросы сильно влияют на результат

Удаляю выбросы с помощью интерквартильного размаха

The boxplot shows that male students have a higher math score on average,
but there are outliers among the female students; one of them even scored 0 points.
Perhaps that result was annulled for cheating. In any case, such
outliers strongly affect the result.

I remove the outliers using the interquartile range (IQR).
'''

In [None]:
# First/third quartile and interquartile range of the numeric columns; these are the
# inputs of the IQR-based outlier removal.
# numeric_only=True is needed because the frame also holds string columns: since
# pandas 2.0 DataFrame.quantile no longer skips them by default and raises TypeError.
Q1 = data.quantile(0.25, numeric_only=True)
Q3 = data.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Drop every row that has a numeric value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Only the numeric columns are compared. Comparing the whole frame (string columns
# included) with the quartile Series makes the operands mis-aligned, which
# pandas 2.x rejects with ValueError. The flex methods lt/gt align on column labels.
numeric_cols = data.select_dtypes(include='number')
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (
    numeric_cols.lt(lower_fence, axis='columns')
    | numeric_cols.gt(upper_fence, axis='columns')
).any(axis=1)
# .copy() makes the filtered frame independent of `data`, so assigning columns on
# data_no_out later does not trigger pandas' chained-assignment warning.
data_no_out = data.loc[~is_outlier].copy()
print(data.shape)
print(data_no_out.shape)

In [None]:
# Same per-gender box plot as before, now on the outlier-free frame.
ax = data_no_out.boxplot(by='gender', column='math score')
ax.set_ylabel('Math score')

In [None]:
'''
После удаления выбросов, все равно преимущество за мужчинами. Проверю, есть ли 
значимые различие с помощью т-теста

After removing the outliers, male students still have the advantage. I will check whether
the difference is statistically significant using a t-test.
'''

In [None]:
# Median math score of each gender in the outlier-free data.
for gender in ('female', 'male'):
    gender_rows = data_no_out['gender'] == gender
    print(gender + ' median math score: ', data_no_out.loc[gender_rows, 'math score'].median())

In [None]:
# Math scores of each gender as separate Series; the group sizes are printed.
is_female = data_no_out['gender'] == 'female'
is_male = data_no_out['gender'] == 'male'
math_scores_f = data_no_out.loc[is_female, 'math score']
print(len(math_scores_f))
math_scores_m = data_no_out.loc[is_male, 'math score']
print(len(math_scores_m))

In [None]:
# Draw an equally sized (n=400), seeded sample from each group, then print the
# medians and, below the separator, the means (female first, male second).
math_scores_f_sample_400 = math_scores_f.sample(n=400, random_state=55)
math_scores_m_sample_400 = math_scores_m.sample(n=400, random_state=55)
gender_samples = (math_scores_f_sample_400, math_scores_m_sample_400)
for gender_sample in gender_samples:
    print(gender_sample.median())
print('--------------')
for gender_sample in gender_samples:
    print(gender_sample.mean())

In [None]:
# Standard deviation of each sample (female first, male second).
for gender_sample in (math_scores_f_sample_400, math_scores_m_sample_400):
    print(gender_sample.std())

In [None]:
#H0: there is no difference in the mean math score between female and male students

In [None]:
from scipy.stats import ttest_ind
# Two-sample t-test on the two samples; H0: both groups have the same mean math score.
ttest, pval = ttest_ind(math_scores_f_sample_400, math_scores_m_sample_400)
print("p-value", pval)
if pval < 0.05:
    print("we reject null hypothesis")
else:
    # A non-significant p-value does not prove H0; it only means the data give no
    # grounds to reject it, so the previous wording "accept" was incorrect.
    print("we fail to reject null hypothesis")

In [None]:
#0.000066 - p-value, получается - различия есть
#p-value = 0.000066, so the difference between the groups is statistically significant

In [None]:
#Обработка кат. переменных
#Encoding of the categorical variables

In [None]:
# Separate the features from the numeric target and hold out 30% of the rows
# (seeded, shuffled) as a test set.
y = data_no_out['math score']
X = data_no_out.drop(columns='math score')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=55
)

In [None]:
# Baseline regression: one-hot encode every string column and fit a Lasso model.
categorial_cols = data_no_out.select_dtypes(include='object').columns.to_list()

# OneHotEncoder's `sparse=False` argument was removed in scikit-learn 1.4 (renamed to
# `sparse_output`). To run on both old and new releases, the encoder keeps its default
# output and the ColumnTransformer below is told to always return a dense array.
categorial_pipe = make_pipeline(
    OneHotEncoder(handle_unknown='ignore')
)

full = ColumnTransformer(
    transformers=[
        ('categorial', categorial_pipe, categorial_cols)
    ],
    sparse_threshold=0,  # 0 => the stacked output is always dense
)

pipeline = Pipeline(
    steps=[
        ('preprocess', full),
        ('base', Lasso())
    ]
)

# NOTE: despite its name, SGDpipeline is the fitted Lasso pipeline; the name is kept
# because the evaluation cell refers to it.
SGDpipeline = pipeline.fit(X_train, y_train)

In [None]:
# Score the fitted regression pipeline on the held-out rows: R^2 and mean squared error.
y_pred = SGDpipeline.predict(X_test)
res = r2_score(y_true=y_test, y_pred=y_pred)
print('R2 score is: ', res)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('MSE is ', mse)

In [None]:
'''
Ужасный результат, но это вполне ожидаемо, тк все фичи категориальные. Так что не стоит решать регрессией.
Переведу оценки в школьную шкалу оценок и буду решать задачу многоклассовой классификации.

A terrible result, but that is to be expected, because all the features are categorical. So regression is not a good fit for this problem.
I will convert the scores to the school grade scale and solve a multiclass classification problem instead.
'''

In [None]:
# Convert the 0-100 math score into four grade labels ('2', '3', '4', '5').
score_bins = [0, 39, 59, 79, 100]
marks = ['2', '3', '4', '5']
# pd.cut builds right-closed intervals, so without include_lowest=True the first bin
# is (0, 39] and a score of exactly 0 would silently turn into NaN.
score_cat = pd.cut(
    data_no_out['math score'], score_bins, labels=marks, include_lowest=True
)

In [None]:
# Replace the numeric target with its grade label. `assign` returns a new frame
# instead of writing into the filtered slice of `data`, so it does not trigger pandas'
# SettingWithCopyWarning (which was otherwise only hidden by the global warnings filter).
data_no_out = data_no_out.assign(**{'math score': score_cat})

In [None]:
# Show the first two rows to check the converted 'math score' column.
data_no_out.head(n=2)

In [None]:
# Number of students per 'math score' value.
sns.countplot(data=data_no_out, x='math score')

In [None]:
# Rebuild features and target from the current data_no_out and redo the seeded
# 70/30 train/test split.
y = data_no_out['math score']
X = data_no_out.drop(columns='math score')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=55
)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier






# Multiclass model: one-hot encode every string column and fit a one-vs-rest
# gradient boosting classifier.
categorial_cols = data_no_out.select_dtypes(include='object').columns.to_list()

# OneHotEncoder's `sparse=False` argument was removed in scikit-learn 1.4 (renamed to
# `sparse_output`). To run on both old and new releases, the encoder keeps its default
# output and the ColumnTransformer below is told to always return a dense array.
categorial_pipe = make_pipeline(
    OneHotEncoder(handle_unknown='ignore')
)
full = ColumnTransformer(
    transformers=[
        ('categorial', categorial_pipe, categorial_cols)
    ],
    sparse_threshold=0,  # 0 => the stacked output is always dense
)

pipeline = Pipeline(
    steps=[
        ('preprocess', full),
        # Seeded (same seed as the train/test split) so repeated runs give the same model.
        ('base', OneVsRestClassifier(GradientBoostingClassifier(random_state=55)))
    ]
)

# NOTE: despite its name, xgb_m_cl is the fitted GradientBoostingClassifier pipeline;
# the name is kept because the evaluation cell refers to it.
xgb_m_cl = pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
# Predict on the held-out rows and print the text classification report.
y_pred = xgb_m_cl.predict(X_test)
res = classification_report(y_true=y_test, y_pred=y_pred)
print(res)

In [None]:
'''я попробовал несколько классификаторов и все примерно вышли с результатом f1score = 0.5
Учитывая, что при GradientBoostingClassifier получился самый лучший результат для оценки 4(самая популярная метка),
я думаю, что лучше результата не будет, тк в этой задаче есть такой недостаток, как нехватка данных. И в целом мало данных(996 
наблюдений) и по всем классам кроме 4 - малый % содержания меток в общем наборе

Вывод: не получается построить хорошую модель предсказания, тк есть проблемы с данными. Единственное, что удалось выяснить - у мальчиков в среднем 
выше оценка по математике. 
'''

In [None]:
'''
I tried several classifiers, and all of them ended up with an F1 score of roughly 0.5.
Given that GradientBoostingClassifier gave the best result for grade 4 (the most frequent label),
I do not think a better result is achievable, because this problem suffers from a lack of data: the dataset is small overall (996
observations), and every class except 4 makes up only a small share of the labels.

Conclusion: a good predictive model could not be built because of these data problems. The only thing we managed to find out is that boys have, on average,
a higher math score.'''