In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress all warnings for the remainder of the session.
# NOTE(review): this silences every warning category, including deprecation
# notices from the libraries used below -- consider narrowing the filter.
warnings.simplefilter('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the students-performance dataset from the Kaggle input directory.
csv_path = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (the default for head()).
data.head()

In [None]:
# Integer-encode the categorical columns of `data`, overwriting each column.
# NOTE: the cell overwrites its own inputs, so it must run exactly once on a
# freshly loaded frame; re-running it collapses every value to the fallback code.

# Binary columns: 1 for the listed value, 0 for anything else.
binary_positive = {
    'test preparation course': 'completed',
    'lunch': 'standard',
    'gender': 'male',
}
for column, positive in binary_positive.items():
    data[column] = (data[column] == positive).astype('int64')

# Multi-level columns: explicit code per known level; any value not listed
# falls through to the last code (4 for race/ethnicity, 5 for education).
race_codes = {'group A': 0, 'group B': 1, 'group C': 2, 'group D': 3}
education_codes = {
    "some high school": 0,
    "high school": 1,
    'some college': 2,
    "associate's degree": 3,
    "bachelor's degree": 4,
}
data['race/ethnicity'] = data['race/ethnicity'].map(
    lambda level: race_codes.get(level, 4)
)
data['parental level of education'] = data['parental level of education'].map(
    lambda level: education_codes.get(level, 5)
)

In [None]:
# For each of the columns at positions 1-4 of `data`, draw a bar chart of the
# 0/1 'gender' column (1 = male) aggregated per level of that column.
# Uses sns.catplot with height=: factorplot and its size= argument were
# renamed in seaborn 0.9 and the old name is gone from later releases.
for feature in data.columns[1:5]:
    g = sns.catplot(x=feature, y='gender', data=data, kind='bar', height=6)
    g.set_ylabels('Gender')
    plt.show()

In [None]:
# Target is the encoded 'gender' column; every other column is a feature.
y = data['gender']
x = data.drop(columns='gender')

In [None]:
# Min-max scale every column of x to the [0, 1] range, column by column.
# DataFrame.min()/max() are called directly rather than through np.min/np.max:
# NumPy forwards axis=None, which pandas >= 2.0 interprets as a reduction over
# the whole frame (a single scalar), silently turning per-column scaling into
# one global rescale.
col_min = x.min()
col_range = x.max() - col_min
# A constant column has zero range; divide by 1 instead so it maps to 0
# rather than NaN.
x = (x - col_min) / col_range.replace(0, 1)
x

In [None]:
# One-hot encode the four categorical columns in a single pass. The dummy
# columns are appended after the remaining columns, grouped in the order
# listed here, each group carrying its own prefix.
dummy_prefixes = {
    'race/ethnicity': 'race',
    'parental level of education': 'pl',
    'lunch': 'lunch',
    'test preparation course': 'course',
}
x = pd.get_dummies(x, columns=list(dummy_prefixes), prefix=dummy_prefixes)

In [None]:
# Display the feature frame after one-hot encoding to inspect the generated columns.
x

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split

In [None]:
# Candidate estimators and their hyper-parameter grids. `classifier` and
# `classifier_param` are parallel lists: entry i of one belongs to entry i of
# the other, and the grid-search cell relies on that ordering.
random_state=42
classifier =[DecisionTreeClassifier(random_state=random_state),
             SVC(random_state=random_state),
             RandomForestClassifier(random_state=random_state),
             # liblinear supports both the 'l1' and 'l2' penalties searched
             # below; the default lbfgs solver rejects 'l1', so every 'l1'
             # candidate would fail to fit and score NaN.
             LogisticRegression(random_state=random_state, solver='liblinear'),
             KNeighborsClassifier()]

# Decision tree: minimum split size and depth limit.
dt_param_grid={'min_samples_split': range(10,500,20),
               'max_depth': range(1,20,2)}

# SVM: RBF kernel only, searching kernel width (gamma) and regularisation (C).
svc_param_grid={'kernel': ['rbf'],
                'gamma': [0.001,0.01,0.1,1],
                'C':[1,10,50,100,200,300,500,1000]}

# Random forest: bootstrap disabled, so each tree is fit on the full training set.
rf_param_grid={'max_features':[1,3,10],
               'min_samples_split':[2,3,10],
               'min_samples_leaf':[1,3,10],
               'bootstrap':[False],
               'n_estimators':[100,300],
               'criterion':['gini']}

# Logistic regression: C on a log scale from 1e-3 to 1e3, both penalties.
logreg_param_grid={'C': np.logspace(-3,3,7),
                   'penalty':['l1','l2']}

# k-NN: odd neighbour counts 1..19, both weighting schemes and two metrics.
knn_param_grid={'n_neighbors':np.linspace(1,19,10, dtype=int ).tolist(),
                'weights': ['uniform','distance'],
                'metric' : ['euclidean','manhattan']}


# Same order as `classifier`.
classifier_param=[dt_param_grid,
                  svc_param_grid,
                  rf_param_grid,
                  logreg_param_grid,
                  knn_param_grid]

In [None]:
# Tune each estimator with an exhaustive grid search scored by accuracy under
# 10-fold stratified cross-validation, keeping the best score and the refit
# best estimator in the same order as `classifier`.
cv_result = []
best_estimators = []
for estimator, param_grid in zip(classifier, classifier_param):
    clf = GridSearchCV(
        estimator,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=10),
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    clf.fit(x_train, y_train)
    cv_result.append(clf.best_score_)
    best_estimators.append(clf.best_estimator_)
    print(clf.best_score_)

In [None]:

# Tabulate the best cross-validation accuracy per model and plot it.
# `cv_result` arrives as a plain list from the grid-search cell and is rebound
# to a DataFrame here; the isinstance guard keeps this cell re-runnable instead
# of wrapping an already-built frame a second time.
if not isinstance(cv_result, pd.DataFrame):
    cv_result=pd.DataFrame({'Cross Validation Means':cv_result,'ML_Models':['DecisionTreeClassifier','SVM',
                                                                             'RandomForestClassifier',
                                                                             'LogisticRegression',
                                                                             'KNeighborsClassifier']})
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
g=sns.barplot(x='Cross Validation Means',y='ML_Models',data=cv_result)
g.set_xlabel('Mean Accuracy')
g.set_title('Cross Validation Scores')
plt.show()

In [None]:
# Soft-voting ensemble of the tuned random forest (index 2) and logistic
# regression (index 3) from `best_estimators`; soft voting averages the
# members' predicted class probabilities.
votingC=VotingClassifier(estimators=[
                                     ('rfc',best_estimators[2]),
                                     ('lr',best_estimators[3])],
                                    voting='soft',n_jobs=-1)

votingC=votingC.fit(x_train,y_train)
# accuracy_score is documented as (y_true, y_pred); keep that order so the
# call stays correct if it is later swapped for an asymmetric metric.
print('accuracy score',accuracy_score(y_test,votingC.predict(x_test)))

In [None]:
# Predict gender for every row of x and write the result to test.csv as a
# single 'gender' column without the index.
# NOTE(review): x is the full feature frame, so these predictions include the
# rows the ensemble was trained on -- confirm that is intended for a file
# named "test".
predicted = votingC.predict(x)
test_gender = pd.Series(predicted, name='gender').astype(int)
results = test_gender.to_frame()
results.to_csv('test.csv', index=False)

In [None]:
# Display the predicted gender labels (integer Series named 'gender').
test_gender