In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
# Load the customer table used by the question cells in this notebook and preview it.
df = pd.read_csv(filepath_or_buffer = "galaxy_users.csv")
df.head(n = 2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


### Q1.

In [3]:
# Q1 works on the contiguous run of columns from OnlineSecurity through StreamingMovies
# (a label slice in .loc includes both end points). Copy so later edits never touch df.
df_q1 = df.loc[:, slice("OnlineSecurity", "StreamingMovies")].copy()
df_q1.head(n = 2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,No,No,No


In [4]:
# Distinct answers that occur in the OnlineSecurity column.
pd.unique(df_q1["OnlineSecurity"])

array(['No', 'Yes', 'No internet service'], dtype=object)

In [5]:
# Print the distinct values of every Q1 column, one column per line.
for col_name, col_values in df_q1.items():
    print(col_name, col_values.unique())

OnlineSecurity ['No' 'Yes' 'No internet service']
OnlineBackup ['Yes' 'No' 'No internet service']
DeviceProtection ['No' 'Yes' 'No internet service']
TechSupport ['No' 'Yes' 'No internet service']
StreamingTV ['No' 'Yes' 'No internet service']
StreamingMovies ['No' 'Yes' 'No internet service']


In [6]:
# Keep the rows whose OnlineSecurity value is anything other than "No internet service".
df_q1_sub = df_q1.loc[df_q1["OnlineSecurity"].ne("No internet service")]

In [7]:
# Check which answers are left in another column after filtering on OnlineSecurity.
pd.unique(df_q1_sub["OnlineBackup"])

array(['Yes', 'No'], dtype=object)

In [8]:
# Inspect the rows flagged "No internet service" in OnlineSecurity to see what the
# remaining service columns contain for them.
df_q1.loc[df_q1["OnlineSecurity"].eq("No internet service")]

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
11,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
16,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
21,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
22,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
33,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
...,...,...,...,...,...,...
7006,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7008,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7009,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service
7019,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [9]:
# Distinct values of every column at once, collected column by column via apply.
# Alternatives kept for reference:
# df_q1.unique()
# df_q1.apply(lambda x: [x.unique()]) # variant for the exam environment (version 0.25.2)
df_q1.apply(lambda x: x.unique())

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,Yes,Yes,Yes
2,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service


In [10]:
# Same per-column distinct-value check, now on the filtered frame.
df_q1_sub.apply(pd.Series.unique)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No,Yes,No,No,No,No
1,Yes,No,Yes,Yes,Yes,Yes


#### 만약 "No internet service"가 불규칙하게 배치된 경우

In [11]:
# Drop every row in which ANY column holds "No internet service", so the filter still
# works when the marker is scattered unevenly across the columns.
# A single vectorised row mask replaces filtering the frame once per column in a loop;
# the surviving rows are the same.
has_no_service = df_q1.eq("No internet service").any(axis = 1)
df_q1_sub = df_q1.loc[~has_no_service].copy()

In [12]:
# Alternative approach: turn the marker into missing values so the usual NaN tools apply.
# replace() already returns a new frame, so df_q1 itself is left untouched.
df_q1_sub = df_q1.replace(to_replace = "No internet service", value = np.nan)

In [13]:
# Missing-value count per column.
df_q1_sub.isna().sum(axis = 0)

OnlineSecurity      1520
OnlineBackup        1520
DeviceProtection    1520
TechSupport         1520
StreamingTV         1520
StreamingMovies     1520
dtype: int64

In [14]:
# Remove every row that has at least one missing value.
df_q1_sub = df_q1_sub.dropna(axis = 0, how = "any")

In [15]:
# Missing-value count per column, to confirm nothing is left to drop.
df_q1_sub.isna().sum(axis = 0)

OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
dtype: int64

In [16]:
# Encode the answers as integers so they can be summed per customer.
# infer_objects() makes the object -> integer conversion explicit instead of relying on
# replace() silently downcasting the result, a behaviour pandas has deprecated.
df_q1_sub = df_q1_sub.replace({"Yes": 1, "No": 0}).infer_objects()

In [17]:
# Number of "Yes" answers per customer.
# Sum only the original service columns (df_q1.columns) so that re-running this cell
# does not add an already existing yes_cnt column into the total.
df_q1_sub["yes_cnt"] = df_q1_sub[df_q1.columns].sum(axis = 1)

In [18]:
# Preview the encoded frame together with the new yes_cnt column.
df_q1_sub.head(n = 2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,yes_cnt
0,0,1,0,0,0,0,1
1,1,0,1,0,0,0,2


In [19]:
# How many customers fall into each yes_cnt bucket, most frequent first.
df_q1_sub["yes_cnt"].value_counts(sort = True, ascending = False)

yes_cnt
3    1117
2    1033
1     966
4     850
0     693
5     569
6     284
Name: count, dtype: int64

In [20]:
# Ratio of customers with exactly one "Yes" to customers with six, to one decimal place.
# The two counts are read from the data instead of being typed in from a printed
# output, so the answer stays correct if the data or the filtering changes.
yes_cnt_freq = df_q1_sub["yes_cnt"].value_counts()
round(float(yes_cnt_freq.loc[1] / yes_cnt_freq.loc[6]), 1)

3.4

### Q2.

In [21]:
# Q2 only needs the tenure and charge columns; copy so new columns never touch df.
df_q2 = df.loc[:, ["tenure", "MonthlyCharges", "TotalCharges"]].copy()
df_q2.head(n = 2)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,1,29.85,29.85
1,34,56.95,1889.5


In [22]:
# Quick reminder of the floor-division operator: `//` keeps only the whole-number
# part of the quotient (13 / 5 = 2.6 -> 2, 7 / 2 = 3.5 -> 3).
13 // 5, 7 // 2

(2, 3)

In [23]:
# Whole number of monthly charges contained in the total charge.
df_q2["month"] = df_q2["TotalCharges"].floordiv(df_q2["MonthlyCharges"])
df_q2.head(n = 2)

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,month
0,1,29.85,29.85,1.0
1,34,56.95,1889.5,33.0


In [24]:
# Pairwise correlations between tenure, MonthlyCharges and the derived month column
# (the columns at positions 0, 1 and 3 of df_q2), rounded to three decimals.
df_q2[["tenure", "MonthlyCharges", "month"]].corr().round(decimals = 3)

Unnamed: 0,tenure,MonthlyCharges,month
tenure,1.0,0.247,0.999
MonthlyCharges,0.247,1.0,0.246
month,0.999,0.246,1.0


In [25]:
# Same correlation matrix, with the row labels moved into an "index" column so the
# table can be reshaped to long form.
df_corr = df_q2[["tenure", "MonthlyCharges", "month"]].corr().reset_index()
df_corr

Unnamed: 0,index,tenure,MonthlyCharges,month
0,tenure,1.0,0.246862,0.998831
1,MonthlyCharges,0.246862,1.0,0.246164
2,month,0.998831,0.246164,1.0


In [26]:
# Long form: one row per (index, variable) pair, keeping only pairs of two different
# variables so the self-correlations drop out.
df_corr_melt = (
    df_corr
    .melt(id_vars = "index")
    .loc[lambda pairs: pairs["index"].ne(pairs["variable"])]
)
df_corr_melt.head(n = 2)

Unnamed: 0,index,variable,value
1,MonthlyCharges,tenure,0.246862
2,month,tenure,0.998831


In [27]:
# Back to wide form; the pairs removed above show up as empty cells.
df_corr_melt.pivot_table(
    values = "value",
    index = "index",
    columns = "variable",
    aggfunc = "max",
)

variable,MonthlyCharges,month,tenure
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MonthlyCharges,,0.246164,0.246862
month,0.246164,,0.998831
tenure,0.246862,0.998831,


In [28]:
# Correlation matrix with the diagonal (each variable against itself) blanked out.
# An identity-shaped boolean mask marks exactly the diagonal cells.
aa = df_q2[["tenure", "MonthlyCharges", "month"]].corr()
aa = aa.mask(np.eye(len(aa), dtype = bool))

aa

Unnamed: 0,tenure,MonthlyCharges,month
tenure,,0.246862,0.998831
MonthlyCharges,0.246862,,0.246164
month,0.998831,0.246164,


### Q3.

In [29]:
# Q3 modelling frame: the Churn target first, followed by the two groups of predictors.
col1 = [
    "SeniorCitizen", "Partner", "Dependents",
    "tenure", "MonthlyCharges", "TotalCharges",
]
col2 = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingMovies", "PaperlessBilling",
]
df_q3 = df.loc[:, ["Churn", *col1, *col2]].copy()

In [30]:
# Preview the selected columns before any encoding.
df_q3.head(n = 2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,No,0,Yes,No,1,29.85,29.85,No,Yes,No,No,No,Yes
1,No,0,No,No,34,56.95,1889.5,Yes,No,Yes,No,No,No


In [31]:
# Encode Yes/No as 1/0 across the whole frame.
# infer_objects() converts the columns that are now purely numeric to a numeric dtype
# explicitly, instead of relying on replace() silently downcasting (deprecated in
# pandas). Columns that still hold other strings stay as object.
df_q3 = df_q3.replace({"Yes": 1, "No": 0}).infer_objects()
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [32]:
# Check which columns became numeric after the Yes/No encoding; any column still
# reported as object holds at least one value that was not encoded.
df_q3.dtypes

Churn                 int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
MonthlyCharges      float64
TotalCharges        float64
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingMovies      object
PaperlessBilling      int64
dtype: object

In [33]:
# Distinct values left in the service columns, to find what still needs encoding.
df_q3.loc[:, slice("OnlineSecurity", "StreamingMovies")].apply(pd.Series.unique)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,1,1
2,No internet service,No internet service,No internet service,No internet service,No internet service


In [34]:
# Pick out the columns that are not numeric yet.
df_q3_dum = df_q3.select_dtypes(exclude = np.number) # does not work in the exam environment's version
df_q3_dum.head(n = 2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,0,0


In [35]:
# Alternative to select_dtypes: look the object-typed columns up in the dtypes Series.
ser_d = df_q3.dtypes
is_object = ser_d.eq("object")
df_q3_dum = df_q3.loc[:, ser_d.index[is_object]]
df_q3_dum.head(n = 2)

Unnamed: 0,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies
0,0,1,0,0,0
1,1,0,1,0,0


In [36]:
# Give "No internet service" its own numeric code (-1) so every column can be scaled.
# infer_objects() turns the now purely numeric object columns into a numeric dtype
# explicitly, instead of relying on replace() silently downcasting (deprecated in
# pandas).
df_q3 = df_q3.replace({"No internet service": -1}).infer_objects()
df_q3.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,0,0,1,0,1,29.85,29.85,0,1,0,0,0,1
1,0,0,0,0,34,56.95,1889.5,1,0,1,0,0,0


In [37]:
# 70 % of the rows go to training, the rest to testing; the seed fixes the shuffle.
df_train, df_test = train_test_split(
    df_q3,
    train_size = 0.7,
    random_state = 123,
)
df_train.shape[0], df_test.shape[0]

(4922, 2110)

In [38]:
# Fit the min-max scaler on the training rows only, then apply the same scaling to both
# splits. The whole frame is passed in, so column 0 of each array is the Churn target.
model_nor = MinMaxScaler()
model_nor.fit(df_train)
arr_train_nor, arr_test_nor = (
    model_nor.transform(split) for split in (df_train, df_test)
)

In [39]:
# First scaled training row, kept two-dimensional.
arr_train_nor[:1, :]

array([[1.        , 0.        , 0.        , 0.        , 0.08450704,
        0.81116094, 0.07551927, 0.5       , 1.        , 0.5       ,
        0.5       , 1.        , 1.        ]])

In [40]:
# 만약 DataFrame 으로 핸들링 하고싶은 경우
df_train_nor = pd.DataFrame(arr_train_nor, columns = df_train.columns)
df_train_nor.head(2)

Unnamed: 0,Churn,SeniorCitizen,Partner,Dependents,tenure,MonthlyCharges,TotalCharges,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingMovies,PaperlessBilling
0,1.0,0.0,0.0,0.0,0.084507,0.811161,0.075519,0.5,1.0,0.5,0.5,1.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.607374,0.006987,0.5,0.5,0.5,0.5,0.5,1.0


In [41]:
# Column 0 of the scaled arrays is the Churn target; the remaining columns are the
# predictors. fit() returns the estimator itself, so construction and fitting chain.
model_lr = LogisticRegression(random_state = 123).fit(
    arr_train_nor[:, 1:],
    arr_train_nor[:, 0],
)
pred = model_lr.predict(arr_test_nor[:, 1:])

In [42]:
# Predicted class label for every test row.
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [46]:
# F1 score of the test-set predictions against the true labels (column 0), to 2 decimals.
round(f1_score(arr_test_nor[:, 0], pred), 2)

0.55

#### 예외 처리를 하는 범주가 n 개 있을 때 대응 방법.

In [48]:
# Second data set for the category-replacement example; read from the parent directory.
df_dia = pd.read_csv(filepath_or_buffer = "../diamonds.csv")
df_dia.head(n = 2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [49]:
# Distinct colour grades present in the data.
pd.unique(df_dia["color"])

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

"F"와 "G"를 제외한 나머지 모든 범주를 -1로 바꿔야 한다면!??! ~(재시험)~

In [50]:
# The three categorical columns of the diamonds table.
df_dia_cat = df_dia.loc[:, ["cut", "color", "clarity"]]
df_dia_cat.head(n = 2)

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1


In [54]:
# .explode(): 리스트를 해체시켜 내부 원소를 쫙~~ 나열
ser_u = df_dia_cat.apply(lambda x: x.unique()).explode()
ser_u

cut            Ideal
cut          Premium
cut             Good
cut        Very Good
cut             Fair
color              E
color              I
color              J
color              H
color              F
color              G
color              D
clarity          SI2
clarity          SI1
clarity          VS1
clarity          VS2
clarity         VVS2
clarity         VVS1
clarity           I1
clarity           IF
dtype: object

In [55]:
# Preview of the replacement values: every label other than "F" or "G" becomes -1.
np.where(~ser_u.isin(["F", "G"]), -1, ser_u)

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, 'F', 'G', -1, -1, -1, -1, -1,
       -1, -1, -1, -1], dtype=object)

In [57]:
# Lookup table keyed by the original label: "F" and "G" map to themselves,
# every other label maps to -1.
ser_repl = pd.Series(
    data = np.where(~ser_u.isin(["F", "G"]), -1, ser_u),
    index = ser_u,
)
ser_repl.to_dict()

{'Ideal': -1,
 'Premium': -1,
 'Good': -1,
 'Very Good': -1,
 'Fair': -1,
 'E': -1,
 'I': -1,
 'J': -1,
 'H': -1,
 'F': 'F',
 'G': 'G',
 'D': -1,
 'SI2': -1,
 'SI1': -1,
 'VS1': -1,
 'VS2': -1,
 'VVS2': -1,
 'VVS1': -1,
 'I1': -1,
 'IF': -1}

In [58]:
# Apply the old-label -> new-value lookup to the whole table in a single pass.
df_dia2 = df_dia.replace(to_replace = ser_repl.to_dict())

In [59]:
# Preview the table after the replacement.
df_dia2.head(n = 5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,-1,-1,-1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,-1,-1,-1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,-1,-1,-1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,-1,-1,-1,62.4,58.0,334,4.2,4.23,2.63
4,0.31,-1,-1,-1,63.3,58.0,335,4.34,4.35,2.75


In [60]:
# Distinct values per column, to confirm which labels survived the replacement.
df_dia2.apply(pd.Series.unique)

carat      [0.23, 0.21, 0.29, 0.31, 0.24, 0.26, 0.22, 0.3...
cut                                                     [-1]
color                                             [-1, F, G]
clarity                                                 [-1]
depth      [61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61....
table      [55.0, 61.0, 65.0, 58.0, 57.0, 56.0, 54.0, 62....
price      [326, 327, 334, 335, 336, 337, 338, 339, 340, ...
x          [3.95, 3.89, 4.05, 4.2, 4.34, 3.94, 4.07, 3.87...
y          [3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 4.11, 3.7...
z          [2.43, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.4...
dtype: object