In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

plt.style.use('seaborn')
sns.set(font_scale=1.5)

In [2]:
student = pd.read_csv('../student/student-mat.csv')

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer

In [4]:
ct = ColumnTransformer([('scaling', Normalizer(),  ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 
                                                        'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']),
                       ('onehot', OneHotEncoder(sparse = False),
                       ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'guardian','schoolsup', 'famsup', 'paid', 'activities', 
             'nursery', 'higher', 'internet', 'romantic'  ])]    )

In [5]:
student.drop('reason', axis = 1, inplace = True)

In [6]:
student['aver'] =( student['G1'] + student['G2'] ) / 2

In [7]:
X, y = (student.drop(['G1', 'G2', 'G3'], axis = 1)), student.G3

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10, test_size = 0.2)

In [9]:
X_train_trans = ct.fit_transform(X_train)
X_test_trans = ct.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

In [11]:
param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv = 5, return_train_score = True)
grid_search.fit(X_train_trans, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [12]:
print(grid_search.score(X_test_trans, y_test))

print("최고 교차 검증 점수: ", grid_search.best_score_)
print("최적 매개변수 : ", grid_search.best_params_)

0.11392405063291139
최고 교차 검증 점수:  0.16772151898734178
최적 매개변수 :  {'C': 0.1}


In [16]:
pred = grid_search.predict(X_test_trans)
print(pred)

[15 11  8 10 10 10 10  9 10  0 10 11  0  8 11 10 10 11  9 11  8 10 10 10
 11  0 11 10 10 10 10 10 11 10  6 10 10 10 11 11 10 10  0 10 15 11 10  0
 10 10 10 10 10 10  0 10 13 10 15  0 13 15  0 10 10  0  0 15 11 10 13  0
 10  0  9  8 11  8 11]


In [18]:
ytest = y_test.values
print(ytest)

[10  9 12 11 10  9 11 15 10 16 15  5 12  5 17 10  9 15  8  8 13 14 11 11
 12 12  8 15  6  8 13  9  9  0 11 11  9 10 11 14 14 11 14 10 20 10 11  7
 12 13 11  8 12 13  0  8 12  0 19  0 11 14 11 18  0 17  9 18 12 10 10 10
 12  7 10 15  9  9  8]


In [22]:
student['school']= student.school.map({'GP' : 0, 'MS' :  1})
student.sex = student.sex.map({'F' : 0, 'M' :  1})
student.address = student.address.map({'U' : 0, 'R' :  1})
student.famsize = student.famsize.map({'LE3' : 0, 'GT3' :  1})
student.Pstatus = student.Pstatus.map({'T' : 0, 'A' :  1})
student.Mjob = student.Mjob.map({'at_home' : 0, 'health' :  1, 'other' : 2, 'services' : 3, 'teacher' : 4})
student.Fjob= student.Fjob.map({'at_home' : 0, 'health' :  1, 'other' : 2, 'services' : 3, 'teacher' : 4})
student.guardian = student.guardian.map({'mother' : 0, 'father' :  1, 'other' : 2})

In [23]:
col_names = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

for i in col_names:
    student[i]= student[i].map({'yes' : 0, 'no' :  1})

In [24]:
X, y = (student.drop(['G1', 'G2', 'G3'], axis = 1)), student.G3

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10, test_size = 0.2)

In [26]:
param_grid = {'C' : [0.001, 0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv = 5, return_train_score = True)
grid_search.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [27]:
print(grid_search.score(X_test, y_test))

print("최고 교차 검증 점수: ", grid_search.best_score_)
print("최적 매개변수 : ", grid_search.best_params_)

0.21518987341772153
최고 교차 검증 점수:  0.31962025316455694
최적 매개변수 :  {'C': 0.01}


In [29]:
pred = grid_search.predict(X_test)
print(pred)

[ 0 10 10 15  0 10 15 15  0 10 15  0  0  8 14 10  0 15 11 11 11 15 15 15
 13 15 11 13  0 10 10  0 11  0 11 10 10 10 10 11 13  0 10 11 15 11 10 10
 10 11 11 10 10 15  0  0 11  0 15  0 15 15 11 15  0 15 10 15 11 11 10 10
 11  0 10 15  8 11  0]


In [30]:
ytest = y_test.values
print(ytest)

[10  9 12 11 10  9 11 15 10 16 15  5 12  5 17 10  9 15  8  8 13 14 11 11
 12 12  8 15  6  8 13  9  9  0 11 11  9 10 11 14 14 11 14 10 20 10 11  7
 12 13 11  8 12 13  0  8 12  0 19  0 11 14 11 18  0 17  9 18 12 10 10 10
 12  7 10 15  9  9  8]


# 평균차이 유사도

In [15]:
np.sqrt(((y_test - grid_search.predict(X_test_trans)) ** 2).sum()) / len(X_test)

0.6131079006740423

In [32]:
np.sqrt(((y_test - grid_search.predict(X_test)) ** 2).sum()) / len(X_test)

0.45692520627291255

#  cosine similarity

In [21]:
(pred * ytest).sum() / ((np.sqrt(pred**2).sum()) * (np.sqrt(ytest**2).sum()))

0.013102344853852576

In [33]:
(pred * ytest).sum() / ((np.sqrt(pred**2).sum()) * (np.sqrt(ytest**2).sum()))

0.014548893610405763