In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import   r2_score, accuracy_score
from sklearn.model_selection import GridSearchCV , train_test_split

In [None]:
# Read the students' exam results into a DataFrame.
data = pd.read_csv(
    '../input/students-performance-in-exams/StudentsPerformance.csv'
)


In [None]:
# Preview the first five rows of the freshly loaded table.
data.head(n=5)

In [None]:
# Categorical columns used for the categorical plots and the label-encoding loop.
# 'test preparation course' is not listed here; it is encoded separately further down.
cat_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
]

# Numeric exam-score columns.
num_cols = ['math score', 'reading score', 'writing score']
# Derived columns: the sum of the three exam scores and their arithmetic mean.
data['Total Score'] = data['math score'] + data['reading score'] + data['writing score']
# True division keeps the fractional part; floor division (`// 3`) silently
# truncated every mean down to the next lower integer.
data['Mean Score'] = data['Total Score'] / 3

In [None]:
# Report (rows, columns) after the derived score columns were added.
n_rows, n_columns = data.shape
print((n_rows, n_columns))

In [None]:
# Preview the table again, now including 'Total Score' and 'Mean Score'.
data.head(n=5)

## Average overall score and average score for each subject

In [None]:
# Mean of the per-student average (total / 3), then the mean of each subject.
score_series = [
    ('Total', data['Total Score'] / 3),
    ('Reading', data['reading score']),
    ('Math', data['math score']),
    ('Writing', data['writing score']),
]
for label, values in score_series:
    print(label, np.mean(values))

## Displaying data for numerical features

In [None]:
# Histogram (left) and box plot (right) for every numeric score column.
# The grid has exactly one row per column: a 2*n-row layout reserves twice as
# many slots as are drawn and leaves the lower half of a very tall figure empty.
n = len(num_cols)
# squeeze=False keeps `axes` two-dimensional even when there is a single column.
fig, axes = plt.subplots(n, 2, figsize=(10, 4 * n), dpi=100, squeeze=False)
for i, col in enumerate(num_cols):
    hist_ax, box_ax = axes[i]
    hist_ax.hist(data[col], bins=10)
    hist_ax.set_title(col)
    box_ax.boxplot(data[col].values)
    # Title the box plot too so each panel can be read on its own.
    box_ax.set_title(col)
fig.tight_layout()

## Displaying data for categorical features

In [None]:
# Pie chart (left) and bar chart (right) of the category frequencies for every
# categorical column. One grid row per column: a 2*n-row layout on a
# 100-inch-tall figure leaves half of the canvas blank.
n = len(cat_cols)
# squeeze=False keeps `axes` two-dimensional even when there is a single column.
fig, axes = plt.subplots(n, 2, figsize=(12, 5 * n), dpi=100, squeeze=False)
for i, col in enumerate(cat_cols):
    value_counts = data[col].value_counts()
    pie_ax, bar_ax = axes[i]
    pie_ax.pie(value_counts, labels=value_counts.index)
    pie_ax.set_title(col)
    bar_ax.bar(np.arange(len(value_counts)), value_counts, tick_label=value_counts.index)
    # The bar heights are frequencies, so the column name labels the x axis.
    bar_ax.set_xlabel(col)
    bar_ax.set_ylabel('count')
    # Long category names would otherwise overlap.
    bar_ax.tick_params(axis='x', labelrotation=30)
fig.tight_layout()

In [None]:
# Pairwise scatter plots / distributions of the numeric score columns,
# resized to a 10x10 inch figure.
numeric_scores = data[num_cols]
f = sns.pairplot(numeric_scores)
f.fig.set_size_inches(10, 10)

In [None]:
# One horizontal bar chart of category frequencies per categorical column,
# stacked vertically in a single figure.
n = len(cat_cols)
for position, col in enumerate(cat_cols, start=1):
    # Select the panel first; the pandas plot below draws on the current axes.
    plt.subplot(n, 1, position)
    value_counts = data[col].value_counts()
    value_counts.plot(kind="barh", figsize=(15, 60))
    plt.ylabel(col)

## Processing of categorical features

In [None]:
# Replace every categorical column with integer codes.
# A separate LabelEncoder is fitted and kept per column: refitting one shared
# encoder discards every mapping except the last one, so the integer codes
# could no longer be translated back to the original category names.
encoders = {}
for col in cat_cols + ['test preparation course']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col]).astype('int')
    # Use encoders[col].inverse_transform(...) or .classes_ to decode later.
    encoders[col] = le
# After the loop `le` is the encoder fitted on 'test preparation course'.

In [None]:
# Preview the table after the categorical columns were label-encoded.
data.head(n=5)

In [None]:
# Features are every column except the target; the target is the encoded gender.
X = data.drop(columns=['gender'])
y = data['gender']
# A fixed random_state makes the split, and every score reported from it,
# reproducible across runs. stratify=y keeps the class proportions identical
# in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## LogisticRegression and GridSearchCV


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression. The features are not scaled, so the solver may
# need more than the default 100 iterations to converge; max_iter is raised to
# avoid stopping early with a ConvergenceWarning.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Predictions on the training split (not the held-out test split).
pred_model = model.predict(X_train)

In [None]:
# Mean accuracy of the baseline logistic regression on the test split.
logreg_test_score = model.score(X_test, y_test)
print("Logistic Regression  Score : ", logreg_test_score)

In [None]:
# Candidate values of the inverse regularisation strength C, one per decade.
params = dict(C=[1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100])

In [None]:
# 5-fold grid search over C for the logistic regression, using all CPU cores.
# fit() returns the fitted search object, so it can be chained.
model_cv = GridSearchCV(
    estimator=model, param_grid=params, cv=5, n_jobs=-1
).fit(X_train, y_train)
# Predictions of the refitted best estimator on the training split.
pred_model_cv = model_cv.predict(X_train)

In [None]:
# Test-split score of the best logistic regression found by the grid search.
logreg_cv_test_score = model_cv.score(X_test, y_test)
print("Logistic Regression(GridSearchCV) : ", logreg_cv_test_score)

## RandomForestClassifier and GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest. A fixed random_state makes the bootstrap samples and
# feature sub-sampling, and therefore the reported score, reproducible. The
# grid search that clones this estimator inherits the same seed.
random_tree = RandomForestClassifier(random_state=42)
random_tree.fit(X_train, y_train)
# Predictions on the training split (not the held-out test split).
pred_model = random_tree.predict(X_train)

In [None]:
# Mean accuracy of the baseline random forest on the test split.
forest_test_score = random_tree.score(X_test, y_test)
print("RandomForestClassifier  Score : ", forest_test_score)

In [None]:
# Random-forest search space: tree depths 4..11 and two forest sizes.
params_tree = {
    'max_depth': list(range(4, 12)),
    'n_estimators': [100, 500],
}

In [None]:
# 5-fold grid search over depth and forest size, using all CPU cores.
# fit() returns the fitted search object, so it can be chained.
random_tree_cv = GridSearchCV(
    estimator=random_tree, param_grid=params_tree, cv=5, n_jobs=-1
).fit(X_train, y_train)
# Predictions of the refitted best forest on the training split.
pred_model = random_tree_cv.predict(X_train)

In [None]:
# Test-split score of the best random forest found by the grid search.
forest_cv_test_score = random_tree_cv.score(X_test, y_test)
print("RandomForestClassifier(GridSearchCV)  Score : ", forest_cv_test_score)

## KNeighborsClassifier and GridSearchCV

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with default settings.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier().fit(X_train, y_train)
# Predictions on the held-out test split.
knn_pred = knn.predict(X_test)

In [None]:
# Mean accuracy of the baseline KNN classifier on the test split.
knn_test_score = knn.score(X_test, y_test)
print("KNeighborsClassifier  Score : ", knn_test_score)

In [None]:
# KNN search space: 4..11 neighbours, always with distance-weighted voting.
params_knn = {
    'n_neighbors': list(range(4, 12)),
    'weights': ['distance'],
}

In [None]:
# 5-fold grid search over the KNN settings, using all CPU cores.
# fit() returns the fitted search object, so it can be chained.
knn_cv = GridSearchCV(
    estimator=knn, param_grid=params_knn, cv=5, n_jobs=-1
).fit(X_train, y_train)
# Predictions of the refitted best KNN model on the held-out test split.
knn_cv_pred = knn_cv.predict(X_test)

In [None]:
# Test-split score of the best KNN model found by the grid search.
knn_cv_test_score = knn_cv.score(X_test, y_test)
print("KNeighborsClassifier(GridSearchCV)  Score : ", knn_cv_test_score)