In [123]:
import pandas as pd
import numpy as np

from scipy.stats import ttest_1samp, shapiro

In [124]:
# 100명의 키 정보가 들어 있는 데이터가 있다.
# 데이터가 정규성을 만족하는지 확인하라
# 그리고 평균키는 165라 판단할 수 있는지 귀무가설과 대립가설을 설정한 후 유의수준 5%로 검정하라

# 귀무가설(H0) : 평균키 = 165
# 대립가설(H1) : 평균키 <> 165

# 데이터
df = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/scipy/height1.csv")

df.head()

Unnamed: 0,height
0,160.237691
1,164.747324
2,165.401628
3,168.801627
4,153.199021


In [125]:
# 정규성 확인

statistic, pvalue = shapiro(df)

print(pvalue)

0.4558176100254059


In [126]:
# 정규성 만족 -> ttest_1samp

statistic, pvalue = ttest_1samp(df, popmean=165, alternative='two-sided')

print(round(pvalue[0],4))

0.0018


In [127]:
# 등분산 검정

df = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/scipy/scipy2.csv')

df.head()

Unnamed: 0,class,score
0,A,84
1,A,59
2,A,49
3,A,57
4,A,82


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   480 non-null    object
 1   score   480 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 7.6+ KB


In [129]:
df['class'].value_counts()

A    300
B    180
Name: class, dtype: int64

In [130]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

cond_a = (df['class'] == 0)
cond_b = (df['class'] == 1)

df_a = df[cond_a]
df_b = df[cond_b]

In [131]:
# 정규성 확인


a_statistic, a_pvalue = shapiro(df_a['score'])
b_statistic, b_pvalue = shapiro(df_b['score'])

print(a_pvalue, b_pvalue) # 정규성을 가지지 않는다

4.097050521068013e-08 1.1735706948456937e-06


In [132]:
from scipy.stats import levene

eq_statistic, eq_pvalue = levene(df_a['score'], df_b['score'])

print(eq_statistic, eq_pvalue)

0.3145466542912649 0.5751662820554713


In [133]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

In [134]:
x_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv")
x_test = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv")


print(x_train.shape, x_test.shape, y_train.shape)

(6499, 12) (3501, 12) (6499, 2)


In [135]:
print(x_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       6499 non-null   int64  
 1   Surname          6499 non-null   object 
 2   CreditScore      6499 non-null   int64  
 3   Geography        6499 non-null   object 
 4   Gender           6499 non-null   object 
 5   Age              6499 non-null   int64  
 6   Tenure           6499 non-null   int64  
 7   Balance          6499 non-null   float64
 8   NumOfProducts    6499 non-null   int64  
 9   HasCrCard        6499 non-null   int64  
 10  IsActiveMember   6499 non-null   int64  
 11  EstimatedSalary  6499 non-null   float64
dtypes: float64(2), int64(7), object(3)
memory usage: 609.4+ KB
None


In [136]:
print(x_test.info())
cust_id = x_test['CustomerId']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3501 entries, 0 to 3500
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       3501 non-null   int64  
 1   Surname          3501 non-null   object 
 2   CreditScore      3501 non-null   int64  
 3   Geography        3501 non-null   object 
 4   Gender           3501 non-null   object 
 5   Age              3501 non-null   int64  
 6   Tenure           3501 non-null   int64  
 7   Balance          3501 non-null   float64
 8   NumOfProducts    3501 non-null   int64  
 9   HasCrCard        3501 non-null   int64  
 10  IsActiveMember   3501 non-null   int64  
 11  EstimatedSalary  3501 non-null   float64
dtypes: float64(2), int64(7), object(3)
memory usage: 328.3+ KB
None


In [137]:
print(y_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   CustomerId  6499 non-null   int64
 1   Exited      6499 non-null   int64
dtypes: int64(2)
memory usage: 101.7 KB
None


In [138]:
print(x_train.isnull().sum())


CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64


In [139]:
print(x_test.isnull().sum())

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64


In [140]:
print(y_train.isnull().sum())

CustomerId    0
Exited        0
dtype: int64


In [141]:
print(x_train.describe().T)
print(x_test.describe().T)

                  count          mean           std          min  \
CustomerId       6499.0  1.569157e+07  71875.840384  15565701.00   
CreditScore      6499.0  6.503968e+02     96.618957       350.00   
Age              6499.0  3.895707e+01     10.502803        18.00   
Tenure           6499.0  5.041545e+00      2.891779         0.00   
Balance          6499.0  7.683658e+04  62407.570894         0.00   
NumOfProducts    6499.0  1.519772e+00      0.578975         1.00   
HasCrCard        6499.0  7.088783e-01      0.454314         0.00   
IsActiveMember   6499.0  5.143868e-01      0.499831         0.00   
EstimatedSalary  6499.0  1.003466e+05  57944.655305        11.58   

                          25%          50%           75%          max  
CustomerId       1.562949e+07  15691808.00  1.575358e+07  15815660.00  
CreditScore      5.840000e+02       651.00  7.180000e+02       850.00  
Age              3.200000e+01        37.00  4.400000e+01        92.00  
Tenure           3.000000e+00  

In [142]:
# Surname, Geography, Gender

le = LabelEncoder()

x_total = pd.concat([x_train, x_test])

x_total['Surname'] = le.fit_transform(x_total['Surname'])
x_total['Geography'] = le.fit_transform(x_total['Geography'])
x_total['Gender'] = le.fit_transform(x_total['Gender'])

x_train = x_total.iloc[:6499, :]
x_test = x_total.iloc[6499:, :]

# x_total.shape

print(x_train.shape, x_test.shape)

print(x_train.head(3))
print(x_test.head(3))

(6499, 12) (3501, 12)
   CustomerId  Surname  CreditScore  Geography  Gender  Age  Tenure  \
0    15799217     2913          791          1       1   35       7   
1    15748986      256          705          1       2   42       8   
2    15722004     1234          543          0       1   31       4   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0   52436.20              1          1               0        161051.75  
1  166685.92              2          1               1         55313.51  
2  138317.94              1          0               0         61843.73  
   CustomerId  Surname  CreditScore  Geography  Gender  Age  Tenure  \
0    15601012        3          802          0       1   60       3   
1    15734762     1275          602          0       1   56       3   
2    15586757       87          801          0       1   32       4   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  
0   92887.06              1          1

In [143]:
print(x_train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6499 entries, 0 to 6498
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       6499 non-null   int64  
 1   Surname          6499 non-null   int32  
 2   CreditScore      6499 non-null   int64  
 3   Geography        6499 non-null   int32  
 4   Gender           6499 non-null   int32  
 5   Age              6499 non-null   int64  
 6   Tenure           6499 non-null   int64  
 7   Balance          6499 non-null   float64
 8   NumOfProducts    6499 non-null   int64  
 9   HasCrCard        6499 non-null   int64  
 10  IsActiveMember   6499 non-null   int64  
 11  EstimatedSalary  6499 non-null   float64
dtypes: float64(2), int32(3), int64(7)
memory usage: 583.9 KB
None


In [144]:
print(x_test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3501 entries, 0 to 3500
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       3501 non-null   int64  
 1   Surname          3501 non-null   int32  
 2   CreditScore      3501 non-null   int64  
 3   Geography        3501 non-null   int32  
 4   Gender           3501 non-null   int32  
 5   Age              3501 non-null   int64  
 6   Tenure           3501 non-null   int64  
 7   Balance          3501 non-null   float64
 8   NumOfProducts    3501 non-null   int64  
 9   HasCrCard        3501 non-null   int64  
 10  IsActiveMember   3501 non-null   int64  
 11  EstimatedSalary  3501 non-null   float64
dtypes: float64(2), int32(3), int64(7)
memory usage: 314.5 KB
None


In [145]:
model = RandomForestClassifier()

model.fit(x_train, y_train['Exited'])

pred = model.predict(x_test)

In [146]:
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train['Exited'], stratify=y_train['Exited'], test_size=0.2)

val_model = RandomForestClassifier()
val_model.fit(X_train, Y_train)

val_pred = val_model.predict(X_val)

In [147]:
accuracy = accuracy_score(Y_val, val_pred)
roc_auc = roc_auc_score(Y_val, val_pred)

print(accuracy, roc_auc)

0.8592307692307692 0.7108650077476985


In [148]:
# 결과값 제출

result = pd.DataFrame({'cust_id':cust_id, 'pred':pred})

result.to_csv("result.csv", index=False)

In [149]:
res = pd.read_csv("result.csv")

print(res.shape)
print(res.head(3))

(3501, 2)
    cust_id  pred
0  15601012     1
1  15734762     1
2  15586757     0
