In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import section
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

## Information
#### Columns 

- 'school'
- 'school_setting'
- 'school_type'
- 'classroom'
- 'teaching_method'
- 'n_student'
- 'student_id'
- 'gender'
- 'lunch'
- 'pretest'
- 'posttest'
- 'mean_'
- 'success'
   
## **Students with an average of more than 60 are considered to have passed the class.**<br>
#### The success feature is 1 if the student passed the class, otherwise 0

In [None]:
# Load the student test-score dataset from the Kaggle input directory.
df=pd.read_csv("../input/predict-test-scores-of-students/test_scores.csv")
# Show (rows, columns) as a quick check of what was loaded.
df.shape


In [None]:
# Column dtypes and non-null counts; the author's run showed no missing values.
df.info()

In [None]:
# Summary statistics of the dataset.
df.describe()

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Add "mean_": the per-student average of the pretest and posttest scores.
df["mean_"] = (df["pretest"] + df["posttest"]) / 2
df

In [None]:
# Binary target "success": 1 when mean_ is at least 60, otherwise 0.
# NOTE(review): the notebook text says "more than 60", but the comparison
# used here is inclusive (>= 60) — confirm which threshold is intended.
df["success"] = df["mean_"].ge(60).astype("int64")
df

I converted the string values of some features to numeric codes.<br>
I think these features have an effect on success.

In [None]:
# Encode gender numerically: 0 = Male, 1 = Female (any value other than "Male").
# Values that are already 0/1 are kept as they are, so running this cell a
# second time does not collapse the whole column to 1.
df["gender"] = [g if g in (0, 1) else (0 if g == "Male" else 1) for g in df["gender"]]
df

In [None]:
# Encode school_type numerically: 0 = "Non-public", 1 = any other value.
# Values that are already 0/1 are kept as they are, so running this cell a
# second time does not collapse the whole column to 1.
df["school_type"] = [t if t in (0, 1) else (0 if t == "Non-public" else 1) for t in df["school_type"]]
df

In [None]:
# Encode school_setting numerically: 0 = "Urban", 0.5 = "Suburban",
# 1 = any other value.
# Values that are already one of these codes are kept as they are, so running
# this cell a second time does not collapse the whole column to 1.
df["school_setting"] = [
    s if s in (0, 0.5, 1) else (0 if s == "Urban" else (0.5 if s == "Suburban" else 1))
    for s in df["school_setting"]
]
df

In [None]:
# Encode teaching_method numerically: 0 = "Standard", 1 = any other value.
# Values that are already 0/1 are kept as they are, so running this cell a
# second time does not collapse the whole column to 1.
df["teaching_method"] = [m if m in (0, 1) else (0 if m == "Standard" else 1) for m in df["teaching_method"]]
df

In [None]:
# Encode lunch numerically: 0 = "Does not qualify", 1 = any other value.
# Values that are already 0/1 are kept as they are, so running this cell a
# second time does not collapse the whole column to 1.
df["lunch"] = [v if v in (0, 1) else (0 if v == "Does not qualify" else 1) for v in df["lunch"]]
df

## Basic Analysis


In [None]:
# Pairwise scatter plots of the columns of df, with histograms on the diagonal.
pd.plotting.scatter_matrix(df,figsize=(15,15),diagonal="hist",alpha=0.5,s=100)
plt.show()

In [None]:
# Annotated correlation heatmap of the numeric columns only.
# df may still hold non-numeric columns at this point (school, classroom and
# student_id have been neither encoded nor dropped yet), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so select numeric columns first.
sns.heatmap(df.select_dtypes(include="number").corr(),annot=True)

## Supervised Learning part

In [None]:
# Remove the columns that are not used as features.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell
# safe to re-run after the columns are already gone.
df = df.drop(columns=["school","classroom","student_id"], errors="ignore")
df

In [None]:
# Target vector: the binary success label as a NumPy array.
y=df.success.values
# Feature frame: every remaining column except the target.
# NOTE(review): x still contains pretest, posttest and mean_, and success was
# computed directly from mean_, so the target is derivable from the features —
# confirm this is intended.
x=df.drop(["success"],axis=1)
x

In [None]:
# Inspect the target array.
y

In [None]:
# Min-max scale every feature except gender to the [0, 1] range.
# gender is left out of the scaling (the author saw NaN values when it was
# scaled) and is appended back, unchanged, as the last column.
x_q = x["gender"]
# Build a new frame instead of dropping in place so the cell can be re-run.
features = x.drop(columns=["gender"])
# Use the DataFrame's own column-wise min()/max(): np.min/np.max on a
# DataFrame return a single scalar on pandas >= 2.0 (axis=None reduces over
# both axes), which breaks per-column scaling.
col_min = features.min()
col_max = features.max()
x_ = (features - col_min) / (col_max - col_min)
x_ = pd.concat([x_, x_q], axis=1)
x_

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x_, y, test_size=0.2, random_state=42)

# Transpose so the feature matrices have one column per sample, which is the
# layout the hand-written logistic regression uses (np.dot(w.T, x_train)).
x_train, x_test = x_train.T, x_test.T
y_train, y_test = y_train.T, y_test.T

# Report the shape of each split.
for split_name, split in (
    ("x_train: ", x_train),
    ("x_test: ", x_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
):
    print(split_name, split.shape)

In [None]:
#parameter initialize and sigmoid function
def initialize_weights_and_bias(dimension):
    """Return the starting parameters for logistic regression.

    The weights are a (dimension, 1) column with every entry set to 0.01 and
    the bias is 0.0; they are returned as the tuple ``(w, b)``.
    """
    weights = np.ones((dimension, 1)) * 0.01
    bias = 0.0
    return weights, bias

In [None]:
#sigmoid funciton
def sigmoid(z):
    """Apply the logistic function 1 / (1 + e^(-z)) to z (scalar or array)."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
# print(sigmoid(0))

In [None]:
# backward_forward_propagation
def forward_backward_propagation(w,b,x_train,y_train):
    """Compute the logistic-regression cost and its gradients for one pass.

    Parameters
    ----------
    w : weight array, applied as ``np.dot(w.T, x_train)``.
    b : bias added to every score.
    x_train : feature matrix with one column per sample
        (``x_train.shape[1]`` is used as the sample count).
    y_train : 0/1 labels, broadcast against the predictions.

    Returns
    -------
    cost : the cross-entropy loss summed over samples and divided by the
        sample count.
    gradients : dict with keys ``"derivative_weight"`` and
        ``"derivative_bias"``.
    """
    n_samples = x_train.shape[1]  # used to scale the cost and gradients

    # forward propagation
    z = np.dot(w.T, x_train) + b
    y_head = sigmoid(z)
    # Keep the probabilities strictly inside (0, 1) for the log terms only:
    # a saturated sigmoid (exactly 0.0 or 1.0) would give log(0) = -inf and
    # 0 * -inf = nan, turning the whole cost into nan/inf.
    eps = 1e-15
    y_safe = np.clip(y_head, eps, 1 - eps)
    loss = -y_train * np.log(y_safe) - (1 - y_train) * np.log(1 - y_safe)
    cost = np.sum(loss) / n_samples

    # backward propagation (uses the unclipped predictions)
    derivative_weight = np.dot(x_train, (y_head - y_train).T) / n_samples
    derivative_bias = np.sum(y_head - y_train) / n_samples
    gradients = {"derivative_weight": derivative_weight, "derivative_bias": derivative_bias}

    return cost, gradients

In [None]:
#Updating (learning) parameters 
def update(w, b, x_train, y_train, learning_rate,number_of_iterarion):
    """Run gradient descent and plot the cost curve.

    Performs ``number_of_iterarion`` update steps. Each step computes the cost
    and gradients with ``forward_backward_propagation`` and moves ``w`` and
    ``b`` against the gradients scaled by ``learning_rate``. On every 10th
    iteration (starting at 0) the cost is printed and recorded for the plot.

    Returns the tuple ``(parameters, gradients, cost_list)``: the learned
    ``{"weight", "bias"}`` dict, the gradients of the last step, and the cost
    of every iteration.
    """
    cost_history = []
    sampled_costs = []
    sampled_steps = []

    for step in range(number_of_iterarion):
        # One forward/backward pass with the current parameters.
        cost, gradients = forward_backward_propagation(w, b, x_train, y_train)
        cost_history.append(cost)

        # Gradient-descent step.
        w = w - learning_rate * gradients["derivative_weight"]
        b = b - learning_rate * gradients["derivative_bias"]

        # Only every 10th iteration is logged and plotted.
        if step % 10:
            continue
        sampled_costs.append(cost)
        sampled_steps.append(step)
        print("Cost after iteration %i: %f" % (step, cost))

    # Learned parameters after the final step.
    parameters = {"weight": w, "bias": b}

    plt.plot(sampled_steps, sampled_costs)
    plt.xticks(sampled_steps, rotation='vertical')
    plt.xlabel("Number of Iterarion")
    plt.ylabel("Cost")
    plt.show()
    return parameters, gradients, cost_history


In [None]:
# prediction
def predict(w,b,x_test):
    """Predict 0/1 labels for the samples in ``x_test``.

    The samples (one per column of ``x_test``) are scored with
    ``sigmoid(np.dot(w.T, x_test) + b)``. A probability of at most 0.5 maps to
    0 and anything larger maps to 1. The result is a float array of shape
    ``(1, x_test.shape[1])``.
    """
    probabilities = sigmoid(np.dot(w.T, x_test) + b)
    # Threshold the first row of probabilities at 0.5 in one vectorized step.
    Y_prediction = np.where(probabilities[0:1, :] <= 0.5, 0.0, 1.0)
    return Y_prediction

In [None]:
#  logistic_regression
def logistic_regression(x_train, y_train, x_test, y_test, learning_rate ,  num_iterations):
    """Train the hand-written logistic regression and print its test accuracy.

    ``x_train`` and ``x_test`` hold one column per sample; the number of rows
    of ``x_train`` is taken as the number of features. The parameters are
    initialised with ``initialize_weights_and_bias``, learned with ``update``
    and evaluated on ``x_test`` with ``predict``. Nothing is returned.
    """
    n_features = x_train.shape[0]
    w, b = initialize_weights_and_bias(n_features)

    parameters, gradients, cost_list = update(
        w, b, x_train, y_train, learning_rate, num_iterations
    )

    y_prediction_test = predict(parameters["weight"], parameters["bias"], x_test)

    # Accuracy in percent: 100 minus the mean absolute prediction error.
    mean_abs_error = np.mean(np.abs(y_prediction_test - y_test))
    print("test accuracy: {} %".format(100 - mean_abs_error * 100))
    
# Train and evaluate the hand-written model (learning rate 2, 350 iterations).
logistic_regression(x_train, y_train, x_test, y_test,learning_rate = 2, num_iterations = 350)


In [None]:
# The same task with scikit-learn's LogisticRegression, for comparison.
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
# x_train/x_test were transposed after the split; .T turns them back into
# one row per sample before fitting and scoring.
lr.fit(x_train.T,y_train.T)
print("test accuracy {}".format(lr.score(x_test.T,y_test.T)))

## Unsupervised Learning

In [None]:
# Work on an independent copy of the two score columns, so that adding a
# column to `data` later does not write into a slice of df
# (SettingWithCopyWarning).
data=df[["pretest","posttest"]].copy()
plt.scatter(data.pretest,data.posttest)


In [None]:
# Elbow method with K-means: within-cluster sum of squares (inertia) for
# k = 1..19.
from sklearn.cluster import KMeans

# Fit on the two score columns only, so re-running this cell after a "label"
# column has been added to `data` still clusters on the scores alone.
score_columns = data[["pretest","posttest"]]
k_values = range(1,20)
wcss=[]

for k in k_values:
    kmeans=KMeans(n_clusters=k,random_state=42)
    kmeans.fit(score_columns)
    wcss.append(kmeans.inertia_)
plt.plot(k_values,wcss)
plt.show()

# I take k = 10.

In [None]:
# Cluster the students into 10 groups by their pretest/posttest scores.
# Fitting on the two score columns explicitly keeps the cell re-runnable once
# the "label" column exists.
kmeans=KMeans(n_clusters=10,random_state=42)
clusters=kmeans.fit_predict(data[["pretest","posttest"]])

data["label"]=clusters

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
sns.scatterplot(data=data,x="pretest",y="posttest",hue="label")
plt.show()


In [None]:
# Per-cluster averages of pretest and posttest, plus the per-cluster average of
# each student's (pretest + posttest) / 2. Both results go into one table so
# neither is silently discarded (a cell only displays its last expression).
cluster_summary = data.groupby(by=["label"]).mean()
cluster_summary["mean"] = ((data.pretest+data.posttest)/2).groupby(by=data["label"]).mean()
cluster_summary

In [None]:
# Letter grade for each K-means cluster id (list position = cluster id).
# NOTE(review): this order was presumably read off the cluster means of one
# particular run; cluster numbering may differ between runs or library
# versions — verify against the per-cluster means before relying on it.
test_score=["CC","BA","AB","CA","FF","BC","DF","AA","BB","CB"]

In [None]:
# Replace each numeric cluster id with its letter grade from test_score.
# Ids are looked up in a dict and any value that is not a cluster id (for
# example a grade that was already assigned) is left untouched, so re-running
# this cell does not fail.
grade_by_cluster = dict(enumerate(test_score))
data["label"] = [grade_by_cluster.get(c, c) for c in data["label"]]
data["label"]
        

In [None]:
# Sort by grade label before plotting (presumably so the legend entries come
# out in a consistent order). Reassigning instead of sorting in place avoids
# mutating a frame that other cells have already displayed.
data = data.sort_values(by=["label"])
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
sns.scatterplot(data=data,x="pretest",y="posttest",hue="label")
plt.legend()
plt.show()
