In [103]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report

In [4]:
! ls files_for_lab/

categorical.csv numerical.csv   target.csv


In [43]:
## Import data
path = 'files_for_lab/'
categorical = pd.read_csv(path + 'categorical.csv')
categorical.drop('RFA_2R', axis=1, inplace=True) # This attribute is the same for all entrie
numerical = pd.read_csv(path + 'numerical.csv')
target = pd.read_csv(path + 'target.csv')

In [7]:
categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   STATE         95412 non-null  object
 1   CLUSTER       95412 non-null  int64 
 2   HOMEOWNR      95412 non-null  object
 3   GENDER        95412 non-null  object
 4   DATASRCE      95412 non-null  int64 
 5   RFA_2R        95412 non-null  object
 6   RFA_2A        95412 non-null  object
 7   GEOCODE2      95412 non-null  object
 8   DOMAIN_A      95412 non-null  object
 9   DOMAIN_B      95412 non-null  int64 
 10  ODATEW_YR     95412 non-null  int64 
 11  ODATEW_MM     95412 non-null  int64 
 12  DOB_YR        95412 non-null  int64 
 13  DOB_MM        95412 non-null  int64 
 14  MINRDATE_YR   95412 non-null  int64 
 15  MINRDATE_MM   95412 non-null  int64 
 16  MAXRDATE_YR   95412 non-null  int64 
 17  MAXRDATE_MM   95412 non-null  int64 
 18  LASTDATE_YR   95412 non-null  int64 
 19  LAST

For a **categorical** data there are a lot of interger numbers. 

In [12]:
int_categorical = categorical.select_dtypes(include=int)
print(int_categorical.head())
print(len(int_categorical.columns))

   CLUSTER  DATASRCE  DOMAIN_B  ODATEW_YR  ODATEW_MM  DOB_YR  DOB_MM  \
0       36         3         2         89          1      37      12   
1       14         3         1         94          1      52       2   
2       43         3         2         90          1       0       2   
3       44         3         2         87          1      28       1   
4       16         3         2         86          1      20       1   

   MINRDATE_YR  MINRDATE_MM  MAXRDATE_YR  MAXRDATE_MM  LASTDATE_YR  \
0           92            8           94            2           95   
1           93           10           95           12           95   
2           91           11           92            7           95   
3           87           11           94           11           95   
4           93           10           96            1           96   

   LASTDATE_MM  FIRSTDATE_YR  FIRSTDATE_MM  
0           12            89            11  
1           12            93            10  
2          

In [14]:
cat_categorical = categorical.select_dtypes(exclude=int)
print(cat_categorical.head())
print(len(cat_categorical.columns))

  STATE HOMEOWNR GENDER RFA_2R RFA_2A GEOCODE2 DOMAIN_A
0    IL        H      F      L      E        C        T
1    CA        H      M      L      G        A        S
2    NC        U      M      L      E        C        R
3    CA        U      F      L      E        C        R
4    FL        H      F      L      F        A        S
7


In [15]:
print(len(categorical.columns))

22


In [17]:
cat_categorical.nunique()

STATE       12
HOMEOWNR     2
GENDER       3
RFA_2R       1
RFA_2A       4
GEOCODE2     4
DOMAIN_A     5
dtype: int64

In [50]:
print(cat_categorical['STATE'].unique())
print(cat_categorical['STATE'].value_counts())
# This feature is already cleaned.

['IL' 'CA' 'NC' 'FL' 'other' 'IN' 'MI' 'MO' 'TX' 'WA' 'WI' 'GA']
other    30457
CA       17343
FL        8376
TX        7535
IL        6420
MI        5654
NC        4160
WA        3577
GA        3403
IN        2980
WI        2795
MO        2712
Name: STATE, dtype: int64


In [51]:
int_categorical.nunique()

CLUSTER         53
DATASRCE         3
DOMAIN_B         4
ODATEW_YR       15
ODATEW_MM       12
DOB_YR          96
DOB_MM          12
MINRDATE_YR     20
MINRDATE_MM     12
MAXRDATE_YR     18
MAXRDATE_MM     12
LASTDATE_YR      3
LASTDATE_MM     12
FIRSTDATE_YR    26
FIRSTDATE_MM    12
dtype: int64

In [54]:
encoder = OneHotEncoder(drop="first").fit(categorical)
categorical = encoder.transform(categorical)

In [82]:
for column in target.columns:
    print(target[column].value_counts())

0    90569
1     4843
Name: TARGET_B, dtype: int64
0.00     90569
10.00      941
15.00      591
20.00      577
5.00       503
         ...  
4.50         1
55.00        1
18.25        1
16.87        1
48.00        1
Name: TARGET_D, Length: 71, dtype: int64


In [87]:
X = np.concatenate([np.array(numerical), np.array(categorical.todense())], axis=1)
y = target['TARGET_B']

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [93]:
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X, y)

In [95]:
y_train.value_counts()

0    90569
1    90569
Name: TARGET_B, dtype: int64

In [97]:
clf = RandomForestClassifier(n_estimators=50, min_samples_split=2, min_samples_leaf=2).fit(X_train,y_train)

In [99]:
prediction = clf.predict(X_test)

In [102]:
print(confusion_matrix(prediction, y_test))

[[29902   335]
 [    0  1249]]


In [104]:
print(classification_report(prediction, y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     30237
           1       0.79      1.00      0.88      1249

    accuracy                           0.99     31486
   macro avg       0.89      0.99      0.94     31486
weighted avg       0.99      0.99      0.99     31486

