In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the enrollee data set; the relative path is resolved against the current working directory.
df = pd.read_csv("edu_enrollees.csv")
# Preview the first two rows.
df.head(2)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,Xgrp
0,8949.0,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1.0,train
1,29725.0,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0,train


### Q. 전처리

In [3]:
# Remove three columns, then discard every row that still contains a missing value.
# Quick check afterwards if needed: df.isna().sum()
df = (
    df
    .drop(columns = ["city", "company_size", "company_type"])
    .dropna()
)

In [8]:
# Distinct raw values of the two string-coded columns, shown as a 2-tuple of arrays.
tuple(df[name].unique() for name in ("experience", "last_new_job"))

(array(['>20', '15', '13', '7', '5', '16', '4', '11', '<1', '18', '19',
        '12', '10', '9', '2', '6', '14', '3', '8', '20', '17', '1'],
       dtype=object),
 array(['1', '>4', '4', '3', '2', 'never'], dtype=object))

In [9]:
# Drop rows holding an open-ended or non-numeric category in either column,
# so that both columns can be converted to integers afterwards.
bad_experience = df["experience"].isin([">20", "<1"])
bad_last_job = df["last_new_job"].isin([">4", "never"])
df = df.loc[~(bad_experience | bad_last_job)]

In [10]:
# Convert the two string-coded columns to integers.
# A single astype mapping returns a new frame instead of assigning columns onto
# a frame that came out of boolean filtering, which avoids SettingWithCopyWarning.
df = df.astype({"experience": "int", "last_new_job": "int"})

In [11]:
# Number of rows left after the preprocessing steps.
df.shape[0]

7522

In [12]:
# Base table for all questions: the preprocessed frame with a fresh 0..n-1 index
# (the old index is discarded, not kept as a column).
df_base = df.reset_index(drop = True)

### Q1.

In [14]:
# Q1 only needs the experience flag and the target; work on an independent copy.
df_q1 = df_base.loc[:, ["relevant_experience", "target"]].copy()
df_q1.head(n = 2)

Unnamed: 0,relevant_experience,target
0,Has relevant experience,1.0
1,Has relevant experience,0.0


In [15]:
# Group A: no relevant experience; group B: has relevant experience.
without_exp = df_q1["relevant_experience"].eq("No relevant experience")
with_exp = df_q1["relevant_experience"].eq("Has relevant experience")
df_A = df_q1[without_exp]
df_B = df_q1[with_exp]
# Group sizes (A, B).
df_A.shape[0], df_B.shape[0]

(1413, 6109)

In [18]:
# Share of rows with target == 1 in each group (missing targets excluded).
# Taken as the mean of a boolean mask instead of value_counts(normalize = True)[1]:
# the lookup form raises KeyError when a group has no target == 1 rows and
# depends on the integer key 1 matching the float label 1.0.
val_A = df_A["target"].dropna().eq(1).mean()
val_B = df_B["target"].dropna().eq(1).mean()

In [19]:
# Ratio of the target == 1 share in group A (no relevant experience) to the share
# in group B (has relevant experience), rounded to two decimals.
round(val_A / val_B, 2)

1.77

### Q2.

In [20]:
# Column names available in the base table.
df_base.columns

Index(['enrollee_id', 'city_development_index', 'gender',
       'relevant_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'last_new_job', 'training_hours',
       'target', 'Xgrp'],
      dtype='object')

In [22]:
# Categorical predictors to be one-hot encoded.
# The columns are named explicitly rather than taken with a "gender":"major_discipline"
# label slice, so the selection does not depend on the column order of df_base.
cat_cols = [
    "gender",
    "relevant_experience",
    "enrolled_university",
    "education_level",
    "major_discipline",
]
df_q2_cat = df_base[cat_cols]
df_q2_cat.head(1)

Unnamed: 0,gender,relevant_experience,enrolled_university,education_level,major_discipline
0,Male,Has relevant experience,no_enrollment,Graduate,STEM


In [26]:
# Names of the categorical columns as a plain Python list.
list(df_q2_cat.columns)

In [27]:
# One-hot encode every categorical column as integer indicator columns.
# Exam-environment variant: pd.get_dummies(df_q2_cat, columns = df_q2_cat.columns)
# The call below, with an explicit dtype, is the form noted for the latest version.
df_q2_dum = pd.get_dummies(data = df_q2_cat, dtype = "int")
df_q2_dum.head(n = 2)

Unnamed: 0,gender_Female,gender_Male,gender_Other,relevant_experience_Has relevant experience,relevant_experience_No relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,enrolled_university_no_enrollment,education_level_Graduate,education_level_Masters,education_level_Phd,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM
0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1
1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1


In [30]:
# Show each dummy column name next to its positional index.
pd.Series(df_q2_dum.columns).reset_index()

Unnamed: 0,index,0
0,0,gender_Female
1,1,gender_Male
2,2,gender_Other
3,3,relevant_experience_Has relevant experience
4,4,relevant_experience_No relevant experience
5,5,enrolled_university_Full time course
6,6,enrolled_university_Part time course
7,7,enrolled_university_no_enrollment
8,8,education_level_Graduate
9,9,education_level_Masters


In [None]:
# Scratch note: positions in df_q2_dum.columns of the five dummy columns that are
# removed before modelling (one per categorical variable).
[2, 4, 7, 10, 16]

In [33]:
# Positions of the dummy columns that remain: all 17 minus the five excluded ones.
{pos for pos in range(17) if pos not in (2, 4, 7, 10, 16)}

{0, 1, 3, 5, 6, 8, 9, 11, 12, 13, 14, 15}

In [34]:
# Remove one dummy column per categorical variable.
# Columns are dropped by name instead of by position ([2, 4, 7, 10, 16]) so that a
# change in the set or order of categories cannot silently remove the wrong columns;
# a missing name raises KeyError instead.
reference_levels = [
    "gender_Other",
    "relevant_experience_No relevant experience",
    "enrolled_university_no_enrollment",
    "education_level_Phd",
    "major_discipline_STEM",
]
df_q2_dum = df_q2_dum.drop(columns = reference_levels)

In [35]:
# Preview the remaining dummy columns.
df_q2_dum.head(2)

Unnamed: 0,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0,1,1,0,0,1,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,0,0


In [36]:
# col1: target label and train/test group marker; col2: predictors taken as-is from df_base.
col1 = [
    "target",
    "Xgrp",
]
col2 = [
    "city_development_index",
    "experience",
    "last_new_job",
    "training_hours",
]

In [39]:
# Modelling table: label/group columns and as-is predictors from df_base,
# followed column-wise by the dummy columns (rows are matched on the index).
df_job2 = pd.concat(
    objs = [df_base[[*col1, *col2]], df_q2_dum],
    axis = "columns",
)

In [40]:
# Preview the assembled modelling table.
df_job2.head(2)

Unnamed: 0,target,Xgrp,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1.0,train,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0
1,0.0,train,0.92,5,1,108.0,0,1,1,0,0,1,0,0,0,0,0,0


In [None]:
# Logistic regression fitted on every row of df_job2 (train and test groups together).
# C is the inverse regularisation strength, so C = 100000 leaves the penalty very weak.
features_all = df_job2.drop(columns = ["target", "Xgrp"])
model_lr = LogisticRegression(solver = "liblinear", C = 100000,
                              max_iter = 1000, random_state = 123)
model_lr.fit(X = features_all, y = df_job2["target"])

In [43]:
model_lr.coef_ # regression coefficients

array([[-6.11732384e+00, -2.85015530e-02,  9.56531984e-02,
        -9.26206002e-04, -1.62975865e-01, -1.36723955e-01,
        -7.60567385e-01,  5.14109835e-01, -2.81496522e-01,
         3.23116412e-01,  1.67418791e-02,  2.61665001e-01,
         1.07607195e-01,  2.43479152e-01,  3.97900227e-01,
        -4.39387752e-01]])

In [46]:
# Largest exponentiated coefficient (odds ratio) across all features, to four decimals.
round(np.max(np.exp(model_lr.coef_)), 4)

1.6721

In [47]:
# Exponentiated coefficients (odds ratios), one row per feature.
# Labels come from the fitted model (the column names it saw during fit) rather than
# df_job2.columns[2:], which assumes "target" and "Xgrp" are the first two columns.
pd.DataFrame(np.exp(model_lr.coef_),
             columns = model_lr.feature_names_in_).transpose()

Unnamed: 0,0
city_development_index,0.002204
experience,0.971901
last_new_job,1.100377
training_hours,0.999074
gender_Female,0.849612
gender_Male,0.872211
relevant_experience_Has relevant experience,0.467401
enrolled_university_Full time course,1.672149
enrolled_university_Part time course,0.754654
education_level_Graduate,1.381426


### Q3.

In [48]:
# Distinct values of the group marker column.
pd.unique(df_job2["Xgrp"])

array(['train', 'test'], dtype=object)

In [49]:
# Split the modelling table by the predefined group marker.
in_train = df_job2["Xgrp"].eq("train")
in_test = df_job2["Xgrp"].eq("test")
df_train = df_job2[in_train]
df_test = df_job2[in_test]
# Row counts (train, test).
df_train.shape[0], df_test.shape[0]

(4706, 2816)

In [50]:
# (rows, columns) of the training subset; the columns still include "target" and "Xgrp".
df_train.shape

(4706, 18)

In [52]:
# 5-nearest-neighbour classifier: fit on the train group, predict the test group.
# NOTE(review): features are passed unscaled, so large-valued columns such as
# training_hours likely dominate the distance — confirm this is intended.
non_features = ["Xgrp", "target"]
X_train = df_train.drop(columns = non_features)
X_test = df_test.drop(columns = non_features)

model_knn = KNeighborsClassifier(n_neighbors = 5)
model_knn.fit(X = X_train, y = df_train["target"])
pred = model_knn.predict(X_test)
pred[:5]

array([0., 0., 0., 0., 0.])

In [53]:
# y_t: observed labels of the test group; y_p: the KNN predictions for the same rows.
y_t, y_p = df_test["target"], pred

In [58]:
# Confusion table: rows are observed labels, columns are predicted labels.
pd.crosstab(index = y_t, columns = y_p)

In [57]:
# Same confusion table as a bare NumPy array of counts.
arr_y = pd.crosstab(y_t, y_p).to_numpy()
arr_y

array([[1899,  193],
       [ 616,  108]], dtype=int64)

In [62]:
# Accuracy by hand: correctly classified counts (the diagonal) over all counts.
round(np.trace(arr_y) / arr_y.sum(), 2)

0.71

In [63]:
from sklearn.metrics import accuracy_score

In [64]:
# Cross-check the hand-computed accuracy with scikit-learn's implementation.
round(accuracy_score(y_t, y_p), 2)

0.71

ProDS <-- 책도 문제집도 없음.  
빅데이터 분석 기사(빅분기) <-- 책이 많음 ㅎㅎ. 그러나 문제가 매우 쉬움.  
ADP <-- 책이 있긴 하나... 내용이 좀... 책이 비쌈... ㅠ  

https://www.kaggle.com/datasets

In [65]:
import sklearn.metrics as sm

In [67]:
# Names exposed by sklearn.metrics that end in "score".
[name for name in dir(sm) if name.endswith("score")]

['accuracy_score',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision_score',
 'balanced_accuracy_score',
 'calinski_harabasz_score',
 'cohen_kappa_score',
 'completeness_score',
 'consensus_score',
 'd2_absolute_error_score',
 'd2_pinball_score',
 'd2_tweedie_score',
 'davies_bouldin_score',
 'dcg_score',
 'explained_variance_score',
 'f1_score',
 'fbeta_score',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard_score',
 'label_ranking_average_precision_score',
 'mutual_info_score',
 'ndcg_score',
 'normalized_mutual_info_score',
 'precision_score',
 'r2_score',
 'rand_score',
 'recall_score',
 'roc_auc_score',
 'silhouette_score',
 'top_k_accuracy_score',
 'v_measure_score']