In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

In [2]:
housing = fetch_california_housing()
housing
X = pd.DataFrame(housing['data'], columns = housing['feature_names'])
y = pd.DataFrame(housing['target'], columns = ['labels'])
display(X.head())
display(y.head())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


Unnamed: 0,labels
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

model = DecisionTreeRegressor()

X.drop(columns=["Latitude","Longitude"], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

scores = cross_val_score(model, X_train, y_train, cv=5)
scores


array([0.34596247, 0.36114884, 0.34890965, 0.38774453, 0.39851341])

In [4]:
mean_score = scores.mean()
print(mean_score)

0.36845577875567903


# Comparing several models with CV

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np

model1 = DecisionTreeRegressor()
model2 = LinearRegression() # y = b0 + b1 * x1 + b2 * x2 +.....
model3 = KNeighborsRegressor() # weights = "uniform","distance"

# data should really be scaled here
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled_np = scaler.transform(X_train)
X_test_scaled_np  = scaler.transform(X_test)

X_train_scaled_df = pd.DataFrame(X_train_scaled_np)
X_test_scaled_df  = pd.DataFrame(X_test_scaled_np)
#...

model_pipeline = [model1, model2, model3]
model_names = ['Decision Tree Regressor', 'Linear Regression', 'KNN']

scores = {}

for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_scaled_df, y_train, cv=5))
    scores[model_name] = mean_score
print(scores)

# We can use the result to choose the best performing model

{'Decision Tree Regressor': 0.3660456417572856, 'Linear Regression': 0.5238229411465707, 'KNN': 0.5680150673353677}


In [6]:
list(zip(model_pipeline, model_names))

[(DecisionTreeRegressor(), 'Decision Tree Regressor'),
 (LinearRegression(), 'Linear Regression'),
 (KNeighborsRegressor(), 'KNN')]

The previous cross validation analysis has shown that **on average** the most performant model (using default hyperparameters) is the K-NN. However, the **average** doesn't represent the performance of each model in one single dataset cohsen at random (remember that we're using an average). Let's see this.

In [7]:
val_scores = {}

for model, model_name in zip(model_pipeline,model_names):
    model.fit(X_train_scaled_df, y_train)
    val_scores[model_name] = model.score(X_test_scaled_df,y_test)
print(val_scores)

{'Decision Tree Regressor': 0.38521981962174245, 'Linear Regression': 0.5554317214693567, 'KNN': 0.6018887282055191}


As we can see, in the dataset used the performance of the K-NN model is slightly better than the average value obtained using cross validation. However, this will not be allways the case. Sometimes, we can obtain performances slightly worse than the cross valiadation average. The averages obtained using cross validation are usually a decent estimation of the performance that we can expect for a given model.

# Activity 1

In this exercise we will go back to the customer churn data from the last lab (the dataset can also be found in the files_for_activities folder)

Implement cross validation along with logistic regression and decision tree classifier on the data

Create a pipeline as shown in the class example
You can use the following code to set up for this activity.
Finish the pipeline.

# Answer Activity 1

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

churnData = pd.read_csv('/content/drive/MyDrive/CURR-v3.X-MAR2023/UNIT7/DAY3/7.07 Bootstrapping, bagging, RandomForest/Customer-Churn.csv')
churnData['TotalCharges']  = pd.to_numeric(churnData['TotalCharges'], errors='coerce')
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(np.mean(churnData['TotalCharges']))

churnData['Churn']
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [11]:
churn_no = churnData[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges', 'Churn']][churnData['Churn']=='No']
churn_yes = churnData[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges', 'Churn']][churnData['Churn']=='Yes']
churn_no_downsample = churn_no.sample(len(churn_yes))

In [12]:
print(churn_yes.shape)
print(churn_no_downsample.shape)

(1869, 5)
(1869, 5)


In [13]:
from sklearn.preprocessing import StandardScaler

churndf = pd.concat([churn_no_downsample, churn_yes], axis=0)
X = churndf[['tenure', 'SeniorCitizen','MonthlyCharges', 'TotalCharges']]
y = churndf['Churn']

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [14]:
X

array([[ 1.56942685, -0.48633323, -1.67231494, -0.33662469],
       [ 0.73088572, -0.48633323, -0.50574382,  0.12787321],
       [-1.07197769, -0.48633323,  0.08278083, -0.87755545],
       ...,
       [-1.11390475,  2.0562033 ,  0.27837359, -0.89979341],
       [ 1.65328096, -0.48633323,  1.22839558,  2.25216495],
       [-0.98812358,  2.0562033 ,  0.23122176, -0.79295403]])

In [15]:
#complete the code here

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

logreg_model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')
tree_model   = DecisionTreeClassifier()

import numpy as np
model_pipeline = [logreg_model, tree_model]
model_names = ['Logistic Regression', 'Decision Tree Classifier']
scores = {}

for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    scores[model_name] = mean_score
print(scores)

{'Logistic Regression': 0.7424031872509961, 'Decision Tree Classifier': 0.6637179282868526}


## end Activity 1

# Activity 2
- What are the advantages and disadvantages of using bootstrap method? Here is an [external resource](https://blog.paperspace.com/bagging-ensemble-methods/).
- What are the advantages and disadvantages of bagging?

# Answer Activity 2
### Bootstrap

- Advantages

  - Helps generate more samples from the original data. This is helpful when it is not possible to gather enough data from a process or the costs involved with taking new samples is high.
  - For sampling new data, it does not depend on the assumptions of underlying parametric distribution in the data.
  - Reduces variability in the data.

- Disadvantages

  - Depends on representative sample which could be good as well as bad. For eg.,if the original data has extreme values, bootstrap method will reduce the frequency of appearance of such values in the overall data. If those were values were actually outliers, then it is a good thing as we undermining the importance of such data points in the overall data at the end. However, if those data points are some rare important observations, then this could be bad.
  - The method does not work very well when the original sample size is too small.
  
---

### Bagging

- Advantages

  - Prevents over-fitting.
  - Different models are built independent of each other and equal weight is given to all the models.
  - The concept of "wisdom of crowds" which essentially means that the knowledge of a group of people is higher than knowledge of people independently.

- Disadvantages

  - Loss of interpretability; The final output is not interpretable like a decision where we can clearly see the decision spaces.
  - Computation complexity of the model increases.
  - It works well when the base classifier is good. If the base classifier is bad, then it can significantly decrease the performance.

## end Activity 2

# Activity 3

- Go through the documentation of random forests in `sklearn`, and get familiarized with the various parameters that can be used.
- There is another advanced concept called boosting. Conduct some elementary research on boosting and compare it with bagging.

# Answer Activity 3

The documentation: [here](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html).<br>
Boosting: detailed article [here](https://dataaspirant.com/ensemble-methods-bagging-vs-boosting-difference/)

# Random Forest

In [16]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


numerical = pd.read_csv('/content/drive/MyDrive/CURR-v3.X-MAR2023/UNIT7/DAY1/7.01, 7.02 Data Cleaning and Feature selection review/numerical7_02.csv')
categorical = pd.read_csv('/content/drive/MyDrive/CURR-v3.X-MAR2023/UNIT7/DAY1/7.01, 7.02 Data Cleaning and Feature selection review/categorical7_02.csv')
targets = pd.read_csv('/content/drive/MyDrive/CURR-v3.X-MAR2023/UNIT7/DAY1/7.01, 7.02 Data Cleaning and Feature selection review/target7_02.csv')
data = pd.concat([numerical, categorical, targets], axis = 1)
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [17]:
data.drop(columns=['Unnamed: 0'], axis = 1, inplace = True)

In [18]:
data.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,MAXADATE,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MINRDATE,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,NEXTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,OSOURCE,STATE,ZIP,MAILCODE,NOEXCH,MDMAUD,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,TARGET_B,TARGET_D
0,9401.0,1.0,5202.0,46.0,6.0,9.0,16.0,0.0,15.0,55.0,11.0,6.0,2.0,1.0,9.0,3611.0,940.0,998.0,99.0,0.0,0.0,50.0,50.0,67.0,0.0,0.0,31.0,6.0,4.0,2.0,6.0,4.0,14.0,0.0,0.0,2.0,0.0,1.0,4.0,34.0,41.0,43.0,32.0,42.0,45.0,32.0,33.0,46.0,21.0,13.0,14.0,33.0,23.0,10.0,4.0,2.0,11.0,16.0,36.0,22.0,15.0,12.0,1.0,5.0,4.0,21.0,75.0,55.0,23.0,9.0,69.0,4.0,3.0,24.0,317.0,360.0,99.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5468.0,5218.0,12.0,10.0,96.0,4.0,97.0,3.0,9.0,59.0,94.0,88.0,55.0,95.0,5.0,4.0,1.0,3.0,5.0,4.0,2.0,18.0,44.0,5.0,0.0,0.0,0.0,97.0,98.0,98.0,98.0,99.0,94.0,0.0,83.0,76.0,73.0,21.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,91.0,91.0,91.0,94.0,4480.0,13.0,803.0,1088.0,1096.0,1026.0,1037.0,36175.0,2.0,6.0,2.0,5.0,15.0,14.0,13.0,10.0,33.0,2.0,5.0,2.0,5.0,15.0,14.0,14.0,10.0,32.0,6.0,2.0,66.0,3.0,56.0,44.0,9.0,80.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,24.0,32.0,12.0,71.0,70.0,83.0,58.0,81.0,57.0,64.0,57.0,99.0,99.0,0.0,22.0,24.0,4.0,21.0,13.0,2.0,1.0,6.0,0.0,4.0,1.0,0.0,3.0,1.0,0.0,6.0,13.0,1.0,2.0,8.0,18.0,11.0,4.0,3.0,4.0,10.0,7.0,11.0,1.0,6.0,2.0,1.0,16.0,69.0,5.0,2.0,160.0,5.0,5.0,12.0,21.0,7.0,30.0,20.0,14.0,24.0,4.0,24.0,10.0,0.0,0.0,0.0,8.0,15.0,0.0,55.0,10.0,11.0,0.0,0.0,2.0,0.0,3.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,42.0,39.0,50.0,7.0,27.0,16.0,99.0,92.0,53.0,5.0,10.0,2.0,26.0,56.0,97.0,99.0,0.0,0.0,0.0,96.0,0.0,4.0,0.0,0.0,0.0,99.0,0.0,99.0,99.0,99.0,20.0,4.0,6.0,5.0,12.0,9702.0,32.0,6.0,13.0,47.0,3.0,1.0,10.0,9310.0,25.0,9512.0,25.0,9512.0,9310.0,9504.0,18.0,15.666667,148535.0,0.0,2.0,1.0,BOA,CA,91326,A,0,XXXX,14.0,H,M,3.0,L,G,A,S,1.0,0,0.0
1,9001.0,1.0,0.0,,3.0,1.0,2.0,0.0,20.0,29.0,33.0,6.0,8.0,1.0,1.0,7001.0,2040.0,2669.0,0.0,2.0,98.0,49.0,51.0,96.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,35.0,43.0,46.0,37.0,45.0,49.0,23.0,35.0,40.0,25.0,13.0,20.0,19.0,16.0,13.0,10.0,8.0,15.0,14.0,30.0,22.0,19.0,25.0,10.0,23.0,21.0,35.0,44.0,22.0,6.0,2.0,63.0,9.0,9.0,19.0,183.0,254.0,69.0,69.0,1.0,6.0,5.0,3.0,3.0,3.0,0.0,497.0,546.0,2.0,1.0,78.0,22.0,93.0,7.0,18.0,36.0,76.0,65.0,30.0,86.0,14.0,7.0,2.0,5.0,11.0,17.0,3.0,17.0,60.0,18.0,0.0,1.0,0.0,0.0,1.0,6.0,18.0,50.0,0.0,4.0,36.0,49.0,51.0,14.0,5.0,4.0,2.0,24.0,11.0,2.0,3.0,6.0,0.0,2.0,9.0,44.0,0.0,281.0,518.0,251.0,292.0,292.0,340.0,11576.0,32.0,18.0,20.0,15.0,12.0,2.0,0.0,0.0,1.0,20.0,19.0,24.0,18.0,16.0,2.0,0.0,0.0,1.0,28.0,8.0,31.0,11.0,38.0,62.0,8.0,74.0,22.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,21.0,19.0,24.0,6.0,61.0,65.0,73.0,59.0,70.0,56.0,78.0,62.0,82.0,99.0,4.0,10.0,5.0,2.0,6.0,12.0,0.0,1.0,9.0,5.0,18.0,20.0,5.0,7.0,6.0,0.0,11.0,33.0,4.0,3.0,2.0,12.0,3.0,3.0,2.0,0.0,7.0,8.0,3.0,3.0,6.0,7.0,1.0,8.0,74.0,3.0,1.0,120.0,22.0,20.0,28.0,16.0,6.0,5.0,3.0,1.0,23.0,1.0,16.0,6.0,0.0,0.0,0.0,10.0,21.0,0.0,28.0,23.0,32.0,8.0,1.0,14.0,1.0,5.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,84.0,96.0,3.0,0.0,0.0,92.0,65.0,29.0,9.0,22.0,3.0,12.0,23.0,50.0,69.0,31.0,0.0,0.0,0.0,6.0,35.0,44.0,0.0,15.0,22.0,77.0,17.0,97.0,92.0,9.0,2.0,6.0,5.0,26.0,9702.0,63.0,6.0,14.0,202.0,27.0,14.0,2.0,9111.0,16.0,9207.0,5.0,9512.0,9001.0,9101.0,12.0,7.481481,15078.0,1.0,4.0,60.0,AMH,NC,27017,A,0,XXXX,43.0,U,M,3.0,L,E,C,R,2.0,0,0.0
2,8701.0,0.0,2801.0,70.0,1.0,4.0,2.0,0.0,23.0,14.0,31.0,3.0,0.0,3.0,0.0,640.0,160.0,219.0,0.0,8.0,92.0,54.0,46.0,61.0,0.0,0.0,11.0,32.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,31.0,0.0,0.0,1.0,32.0,40.0,44.0,34.0,43.0,47.0,25.0,45.0,35.0,20.0,15.0,25.0,17.0,17.0,12.0,7.0,7.0,20.0,17.0,30.0,14.0,19.0,25.0,11.0,23.0,23.0,27.0,50.0,30.0,15.0,8.0,63.0,9.0,6.0,23.0,199.0,283.0,85.0,83.0,3.0,4.0,1.0,0.0,2.0,0.0,2.0,1000.0,1263.0,2.0,1.0,48.0,52.0,93.0,7.0,6.0,36.0,73.0,61.0,30.0,84.0,16.0,6.0,3.0,3.0,21.0,12.0,4.0,13.0,36.0,13.0,0.0,0.0,0.0,10.0,25.0,50.0,69.0,92.0,10.0,15.0,42.0,55.0,50.0,15.0,5.0,4.0,0.0,9.0,42.0,4.0,0.0,5.0,1.0,8.0,17.0,34.0,9340.0,67.0,862.0,386.0,388.0,396.0,423.0,15130.0,27.0,12.0,4.0,26.0,22.0,5.0,0.0,0.0,4.0,35.0,5.0,6.0,12.0,30.0,6.0,0.0,0.0,5.0,22.0,14.0,26.0,20.0,46.0,54.0,3.0,58.0,36.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,17.0,13.0,15.0,0.0,43.0,69.0,81.0,53.0,68.0,45.0,33.0,31.0,0.0,99.0,23.0,17.0,3.0,0.0,6.0,6.0,0.0,0.0,13.0,42.0,12.0,0.0,0.0,0.0,42.0,0.0,6.0,3.0,0.0,0.0,0.0,23.0,3.0,3.0,6.0,0.0,3.0,3.0,3.0,3.0,3.0,0.0,3.0,6.0,87.0,0.0,0.0,120.0,28.0,12.0,14.0,27.0,10.0,3.0,5.0,0.0,19.0,1.0,17.0,0.0,0.0,0.0,0.0,13.0,23.0,0.0,14.0,40.0,31.0,16.0,0.0,1.0,0.0,13.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,29.0,67.0,56.0,41.0,3.0,0.0,94.0,43.0,27.0,4.0,38.0,0.0,10.0,19.0,39.0,45.0,55.0,0.0,0.0,45.0,22.0,17.0,0.0,0.0,16.0,23.0,77.0,22.0,93.0,89.0,16.0,2.0,6.0,6.0,27.0,9702.0,66.0,6.0,14.0,109.0,16.0,7.0,2.0,8711.0,11.0,9411.0,10.0,9512.0,8702.0,8711.0,9.0,6.8125,172556.0,1.0,4.0,41.0,BRY,CA,95953,A,0,XXXX,44.0,U,F,3.0,L,E,C,R,2.0,0,0.0
3,8601.0,0.0,2001.0,78.0,3.0,2.0,60.0,1.0,28.0,9.0,53.0,26.0,3.0,2.0,,2520.0,627.0,761.0,99.0,0.0,0.0,46.0,54.0,2.0,98.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,45.0,50.0,36.0,46.0,50.0,27.0,34.0,43.0,23.0,14.0,21.0,13.0,15.0,20.0,12.0,5.0,13.0,15.0,34.0,19.0,19.0,31.0,7.0,27.0,16.0,26.0,57.0,36.0,24.0,14.0,42.0,17.0,9.0,33.0,235.0,323.0,99.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,576.0,594.0,4.0,3.0,90.0,10.0,97.0,3.0,0.0,42.0,82.0,49.0,22.0,92.0,8.0,20.0,3.0,17.0,9.0,23.0,1.0,1.0,1.0,0.0,21.0,58.0,19.0,0.0,1.0,2.0,16.0,67.0,0.0,2.0,45.0,52.0,53.0,16.0,6.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,25.0,58.0,74.0,83.0,5000.0,127.0,528.0,240.0,250.0,293.0,321.0,9836.0,24.0,29.0,23.0,13.0,4.0,4.0,0.0,0.0,2.0,21.0,30.0,22.0,16.0,4.0,5.0,0.0,0.0,3.0,35.0,8.0,11.0,14.0,20.0,80.0,4.0,73.0,22.0,1.0,1.0,0.0,0.0,0.0,3.0,1.0,2.0,1.0,24.0,27.0,3.0,76.0,61.0,73.0,51.0,65.0,49.0,80.0,31.0,81.0,99.0,10.0,17.0,8.0,2.0,6.0,15.0,3.0,7.0,22.0,2.0,9.0,0.0,7.0,2.0,2.0,0.0,6.0,1.0,5.0,2.0,2.0,12.0,2.0,7.0,6.0,4.0,15.0,29.0,4.0,3.0,26.0,3.0,2.0,7.0,49.0,12.0,1.0,120.0,16.0,20.0,30.0,13.0,3.0,12.0,5.0,2.0,26.0,1.0,20.0,7.0,1.0,1.0,1.0,15.0,28.0,4.0,9.0,16.0,53.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,65.0,99.0,0.0,0.0,0.0,90.0,45.0,18.0,25.0,34.0,0.0,1.0,3.0,6.0,33.0,67.0,0.0,0.0,9.0,14.0,72.0,3.0,0.0,0.0,99.0,1.0,21.0,99.0,96.0,6.0,2.0,7.0,11.0,43.0,9702.0,113.0,10.0,25.0,254.0,37.0,8.0,3.0,9310.0,15.0,9601.0,15.0,9601.0,7903.0,8005.0,14.0,6.864865,7112.0,1.0,2.0,26.0,,FL,33176,A,0,XXXX,16.0,H,F,3.0,L,F,A,S,2.0,0,0.0
4,8701.0,0.0,6001.0,38.0,4.0,6.0,0.0,0.0,33.0,36.0,34.0,7.0,1.0,1.0,3.0,1067.0,245.0,348.0,0.0,99.0,0.0,46.0,54.0,99.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,35.0,44.0,49.0,40.0,50.0,54.0,25.0,36.0,41.0,23.0,12.0,20.0,18.0,10.0,8.0,10.0,22.0,15.0,14.0,35.0,23.0,13.0,24.0,12.0,22.0,25.0,31.0,45.0,27.0,11.0,4.0,55.0,8.0,18.0,19.0,182.0,263.0,75.0,73.0,14.0,23.0,8.0,3.0,14.0,14.0,0.0,484.0,519.0,3.0,3.0,64.0,36.0,95.0,5.0,0.0,41.0,70.0,61.0,35.0,74.0,26.0,5.0,2.0,3.0,13.0,20.0,5.0,21.0,51.0,27.0,0.0,0.0,0.0,0.0,1.0,1.0,11.0,46.0,0.0,1.0,48.0,54.0,54.0,14.0,5.0,22.0,1.0,1.0,15.0,17.0,3.0,1.0,0.0,9.0,50.0,88.0,0.0,91.0,509.0,251.0,284.0,278.0,311.0,10717.0,18.0,31.0,25.0,20.0,4.0,1.0,0.0,0.0,0.0,10.0,29.0,27.0,27.0,6.0,2.0,0.0,0.0,0.0,31.0,6.0,40.0,3.0,55.0,45.0,14.0,87.0,10.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,18.0,9.0,14.0,1.0,21.0,54.0,65.0,46.0,61.0,45.0,70.0,53.0,99.0,99.0,0.0,6.0,8.0,3.0,14.0,19.0,0.0,0.0,12.0,0.0,15.0,19.0,3.0,1.0,0.0,0.0,5.0,39.0,0.0,1.0,9.0,13.0,5.0,3.0,0.0,3.0,9.0,8.0,4.0,0.0,10.0,0.0,0.0,4.0,82.0,5.0,0.0,120.0,9.0,17.0,44.0,17.0,7.0,3.0,3.0,4.0,13.0,2.0,11.0,4.0,1.0,1.0,1.0,14.0,31.0,0.0,31.0,15.0,48.0,11.0,1.0,3.0,0.0,18.0,0.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0,99.0,0.0,1.0,1.0,99.0,56.0,17.0,5.0,35.0,12.0,14.0,19.0,30.0,45.0,55.0,3.0,6.0,59.0,2.0,39.0,0.0,0.0,0.0,99.0,0.0,97.0,99.0,94.0,5.0,2.0,3.0,4.0,26.0,9702.0,63.0,4.0,9.0,107.0,14.0,8.0,3.0,8705.0,12.0,9410.0,11.0,9504.0,8701.0,8705.0,4.0,7.642857,62117.0,1.0,1.0,53.0,DRK,IN,46755,A,0,XXXX,40.0,H,F,3.0,L,E,D,T,2.0,0,0.0


We still had lots of NA's even though the percentage was small.

In [19]:
data.isna().sum()

ODATEDW     44794
TCODE       44794
DOB         44794
AGE         51436
INCOME      44794
            ...  
GEOCODE2    44794
DOMAIN_A    44794
DOMAIN_B    44794
TARGET_B        0
TARGET_D        0
Length: 340, dtype: int64

In [None]:
# To simplify, let's drop any row with an NA on any column

In [20]:
data.dropna(axis= 0,inplace=True)

In [21]:
data.head()

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,MAXADATE,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MINRDATE,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,NEXTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,OSOURCE,STATE,ZIP,MAILCODE,NOEXCH,MDMAUD,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,TARGET_B,TARGET_D
0,9401.0,1.0,5202.0,46.0,6.0,9.0,16.0,0.0,15.0,55.0,11.0,6.0,2.0,1.0,9.0,3611.0,940.0,998.0,99.0,0.0,0.0,50.0,50.0,67.0,0.0,0.0,31.0,6.0,4.0,2.0,6.0,4.0,14.0,0.0,0.0,2.0,0.0,1.0,4.0,34.0,41.0,43.0,32.0,42.0,45.0,32.0,33.0,46.0,21.0,13.0,14.0,33.0,23.0,10.0,4.0,2.0,11.0,16.0,36.0,22.0,15.0,12.0,1.0,5.0,4.0,21.0,75.0,55.0,23.0,9.0,69.0,4.0,3.0,24.0,317.0,360.0,99.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5468.0,5218.0,12.0,10.0,96.0,4.0,97.0,3.0,9.0,59.0,94.0,88.0,55.0,95.0,5.0,4.0,1.0,3.0,5.0,4.0,2.0,18.0,44.0,5.0,0.0,0.0,0.0,97.0,98.0,98.0,98.0,99.0,94.0,0.0,83.0,76.0,73.0,21.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,91.0,91.0,91.0,94.0,4480.0,13.0,803.0,1088.0,1096.0,1026.0,1037.0,36175.0,2.0,6.0,2.0,5.0,15.0,14.0,13.0,10.0,33.0,2.0,5.0,2.0,5.0,15.0,14.0,14.0,10.0,32.0,6.0,2.0,66.0,3.0,56.0,44.0,9.0,80.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,24.0,32.0,12.0,71.0,70.0,83.0,58.0,81.0,57.0,64.0,57.0,99.0,99.0,0.0,22.0,24.0,4.0,21.0,13.0,2.0,1.0,6.0,0.0,4.0,1.0,0.0,3.0,1.0,0.0,6.0,13.0,1.0,2.0,8.0,18.0,11.0,4.0,3.0,4.0,10.0,7.0,11.0,1.0,6.0,2.0,1.0,16.0,69.0,5.0,2.0,160.0,5.0,5.0,12.0,21.0,7.0,30.0,20.0,14.0,24.0,4.0,24.0,10.0,0.0,0.0,0.0,8.0,15.0,0.0,55.0,10.0,11.0,0.0,0.0,2.0,0.0,3.0,1.0,1.0,2.0,3.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,42.0,39.0,50.0,7.0,27.0,16.0,99.0,92.0,53.0,5.0,10.0,2.0,26.0,56.0,97.0,99.0,0.0,0.0,0.0,96.0,0.0,4.0,0.0,0.0,0.0,99.0,0.0,99.0,99.0,99.0,20.0,4.0,6.0,5.0,12.0,9702.0,32.0,6.0,13.0,47.0,3.0,1.0,10.0,9310.0,25.0,9512.0,25.0,9512.0,9310.0,9504.0,18.0,15.666667,148535.0,0.0,2.0,1.0,BOA,CA,91326,A,0,XXXX,14.0,H,M,3.0,L,G,A,S,1.0,0,0.0
2,8701.0,0.0,2801.0,70.0,1.0,4.0,2.0,0.0,23.0,14.0,31.0,3.0,0.0,3.0,0.0,640.0,160.0,219.0,0.0,8.0,92.0,54.0,46.0,61.0,0.0,0.0,11.0,32.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,31.0,0.0,0.0,1.0,32.0,40.0,44.0,34.0,43.0,47.0,25.0,45.0,35.0,20.0,15.0,25.0,17.0,17.0,12.0,7.0,7.0,20.0,17.0,30.0,14.0,19.0,25.0,11.0,23.0,23.0,27.0,50.0,30.0,15.0,8.0,63.0,9.0,6.0,23.0,199.0,283.0,85.0,83.0,3.0,4.0,1.0,0.0,2.0,0.0,2.0,1000.0,1263.0,2.0,1.0,48.0,52.0,93.0,7.0,6.0,36.0,73.0,61.0,30.0,84.0,16.0,6.0,3.0,3.0,21.0,12.0,4.0,13.0,36.0,13.0,0.0,0.0,0.0,10.0,25.0,50.0,69.0,92.0,10.0,15.0,42.0,55.0,50.0,15.0,5.0,4.0,0.0,9.0,42.0,4.0,0.0,5.0,1.0,8.0,17.0,34.0,9340.0,67.0,862.0,386.0,388.0,396.0,423.0,15130.0,27.0,12.0,4.0,26.0,22.0,5.0,0.0,0.0,4.0,35.0,5.0,6.0,12.0,30.0,6.0,0.0,0.0,5.0,22.0,14.0,26.0,20.0,46.0,54.0,3.0,58.0,36.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,17.0,13.0,15.0,0.0,43.0,69.0,81.0,53.0,68.0,45.0,33.0,31.0,0.0,99.0,23.0,17.0,3.0,0.0,6.0,6.0,0.0,0.0,13.0,42.0,12.0,0.0,0.0,0.0,42.0,0.0,6.0,3.0,0.0,0.0,0.0,23.0,3.0,3.0,6.0,0.0,3.0,3.0,3.0,3.0,3.0,0.0,3.0,6.0,87.0,0.0,0.0,120.0,28.0,12.0,14.0,27.0,10.0,3.0,5.0,0.0,19.0,1.0,17.0,0.0,0.0,0.0,0.0,13.0,23.0,0.0,14.0,40.0,31.0,16.0,0.0,1.0,0.0,13.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,29.0,67.0,56.0,41.0,3.0,0.0,94.0,43.0,27.0,4.0,38.0,0.0,10.0,19.0,39.0,45.0,55.0,0.0,0.0,45.0,22.0,17.0,0.0,0.0,16.0,23.0,77.0,22.0,93.0,89.0,16.0,2.0,6.0,6.0,27.0,9702.0,66.0,6.0,14.0,109.0,16.0,7.0,2.0,8711.0,11.0,9411.0,10.0,9512.0,8702.0,8711.0,9.0,6.8125,172556.0,1.0,4.0,41.0,BRY,CA,95953,A,0,XXXX,44.0,U,F,3.0,L,E,C,R,2.0,0,0.0
4,8701.0,0.0,6001.0,38.0,4.0,6.0,0.0,0.0,33.0,36.0,34.0,7.0,1.0,1.0,3.0,1067.0,245.0,348.0,0.0,99.0,0.0,46.0,54.0,99.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,35.0,44.0,49.0,40.0,50.0,54.0,25.0,36.0,41.0,23.0,12.0,20.0,18.0,10.0,8.0,10.0,22.0,15.0,14.0,35.0,23.0,13.0,24.0,12.0,22.0,25.0,31.0,45.0,27.0,11.0,4.0,55.0,8.0,18.0,19.0,182.0,263.0,75.0,73.0,14.0,23.0,8.0,3.0,14.0,14.0,0.0,484.0,519.0,3.0,3.0,64.0,36.0,95.0,5.0,0.0,41.0,70.0,61.0,35.0,74.0,26.0,5.0,2.0,3.0,13.0,20.0,5.0,21.0,51.0,27.0,0.0,0.0,0.0,0.0,1.0,1.0,11.0,46.0,0.0,1.0,48.0,54.0,54.0,14.0,5.0,22.0,1.0,1.0,15.0,17.0,3.0,1.0,0.0,9.0,50.0,88.0,0.0,91.0,509.0,251.0,284.0,278.0,311.0,10717.0,18.0,31.0,25.0,20.0,4.0,1.0,0.0,0.0,0.0,10.0,29.0,27.0,27.0,6.0,2.0,0.0,0.0,0.0,31.0,6.0,40.0,3.0,55.0,45.0,14.0,87.0,10.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0,18.0,9.0,14.0,1.0,21.0,54.0,65.0,46.0,61.0,45.0,70.0,53.0,99.0,99.0,0.0,6.0,8.0,3.0,14.0,19.0,0.0,0.0,12.0,0.0,15.0,19.0,3.0,1.0,0.0,0.0,5.0,39.0,0.0,1.0,9.0,13.0,5.0,3.0,0.0,3.0,9.0,8.0,4.0,0.0,10.0,0.0,0.0,4.0,82.0,5.0,0.0,120.0,9.0,17.0,44.0,17.0,7.0,3.0,3.0,4.0,13.0,2.0,11.0,4.0,1.0,1.0,1.0,14.0,31.0,0.0,31.0,15.0,48.0,11.0,1.0,3.0,0.0,18.0,0.0,1.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0,99.0,0.0,1.0,1.0,99.0,56.0,17.0,5.0,35.0,12.0,14.0,19.0,30.0,45.0,55.0,3.0,6.0,59.0,2.0,39.0,0.0,0.0,0.0,99.0,0.0,97.0,99.0,94.0,5.0,2.0,3.0,4.0,26.0,9702.0,63.0,4.0,9.0,107.0,14.0,8.0,3.0,8705.0,12.0,9410.0,11.0,9504.0,8701.0,8705.0,4.0,7.642857,62117.0,1.0,1.0,53.0,DRK,IN,46755,A,0,XXXX,40.0,H,F,3.0,L,E,D,T,2.0,0,0.0
9,8901.0,0.0,2603.0,72.0,4.0,2.0,16.0,2.0,30.0,31.0,33.0,9.0,5.0,6.0,0.0,1435.0,384.0,483.0,0.0,0.0,99.0,51.0,49.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.0,42.0,45.0,34.0,46.0,49.0,32.0,35.0,45.0,19.0,12.0,22.0,20.0,13.0,13.0,11.0,9.0,13.0,16.0,34.0,22.0,15.0,27.0,9.0,25.0,17.0,34.0,49.0,35.0,18.0,7.0,65.0,6.0,6.0,23.0,198.0,296.0,81.0,80.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,388.0,428.0,2.0,1.0,85.0,15.0,91.0,9.0,28.0,43.0,80.0,69.0,38.0,91.0,9.0,6.0,3.0,3.0,12.0,13.0,4.0,27.0,56.0,17.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0,34.0,0.0,1.0,54.0,56.0,57.0,15.0,5.0,1.0,2.0,15.0,10.0,0.0,2.0,2.0,0.0,2.0,7.0,48.0,0.0,107.0,613.0,225.0,267.0,267.0,305.0,9587.0,31.0,23.0,21.0,15.0,8.0,1.0,1.0,0.0,0.0,22.0,23.0,24.0,18.0,11.0,1.0,1.0,0.0,0.0,34.0,8.0,47.0,13.0,30.0,70.0,1.0,64.0,15.0,0.0,0.0,0.0,0.0,0.0,6.0,16.0,0.0,11.0,14.0,21.0,6.0,52.0,65.0,72.0,58.0,68.0,53.0,71.0,71.0,76.0,92.0,4.0,9.0,6.0,2.0,9.0,12.0,0.0,0.0,12.0,17.0,15.0,8.0,6.0,4.0,18.0,0.0,8.0,16.0,4.0,1.0,2.0,16.0,2.0,4.0,2.0,1.0,7.0,7.0,4.0,7.0,9.0,4.0,6.0,17.0,54.0,7.0,2.0,120.0,20.0,10.0,40.0,12.0,10.0,6.0,1.0,2.0,26.0,1.0,24.0,3.0,1.0,2.0,0.0,16.0,30.0,1.0,31.0,20.0,33.0,14.0,0.0,1.0,1.0,21.0,0.0,0.0,1.0,0.0,2.0,11.0,0.0,0.0,0.0,2.0,0.0,0.0,89.0,97.0,1.0,0.0,2.0,96.0,66.0,23.0,13.0,34.0,0.0,6.0,12.0,37.0,47.0,53.0,0.0,0.0,1.0,30.0,6.0,40.0,0.0,23.0,37.0,62.0,40.0,98.0,95.0,7.0,2.0,6.0,7.0,21.0,9702.0,55.0,6.0,12.0,48.0,9.0,5.0,4.0,9507.0,7.0,9102.0,6.0,9601.0,8905.0,8910.0,5.0,5.333333,85548.0,1.0,4.0,57.0,ENQ,other,56475,A,0,XXXX,51.0,H,F,3.0,L,D,D,R,3.0,0,0.0
10,9301.0,1.0,2709.0,70.0,4.0,6.0,17.0,0.0,19.0,8.0,76.0,9.0,11.0,3.0,6.0,880.0,190.0,229.0,0.0,99.0,0.0,51.0,49.0,98.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,21.0,38.0,46.0,29.0,41.0,49.0,39.0,32.0,47.0,22.0,26.0,15.0,19.0,14.0,11.0,10.0,5.0,12.0,14.0,35.0,23.0,16.0,24.0,7.0,23.0,10.0,29.0,61.0,55.0,36.0,20.0,61.0,3.0,3.0,32.0,325.0,374.0,83.0,81.0,7.0,16.0,9.0,9.0,0.0,0.0,0.0,766.0,800.0,3.0,3.0,79.0,21.0,94.0,6.0,7.0,48.0,83.0,77.0,44.0,89.0,11.0,4.0,0.0,4.0,6.0,13.0,7.0,33.0,53.0,13.0,0.0,0.0,0.0,0.0,0.0,12.0,53.0,98.0,0.0,2.0,75.0,76.0,69.0,17.0,5.0,11.0,5.0,0.0,6.0,5.0,9.0,0.0,19.0,33.0,52.0,81.0,0.0,291.0,770.0,356.0,373.0,342.0,370.0,8424.0,12.0,17.0,19.0,30.0,22.0,0.0,0.0,0.0,0.0,5.0,9.0,30.0,33.0,22.0,0.0,0.0,0.0,0.0,27.0,0.0,43.0,11.0,42.0,58.0,13.0,82.0,7.0,0.0,0.0,0.0,0.0,2.0,7.0,2.0,0.0,5.0,8.0,12.0,2.0,15.0,60.0,69.0,48.0,69.0,45.0,72.0,56.0,99.0,0.0,0.0,19.0,6.0,4.0,19.0,12.0,0.0,4.0,18.0,3.0,5.0,0.0,9.0,2.0,5.0,1.0,2.0,4.0,3.0,6.0,6.0,22.0,5.0,3.0,9.0,4.0,3.0,17.0,5.0,7.0,9.0,11.0,3.0,9.0,61.0,5.0,1.0,154.0,0.0,4.0,14.0,26.0,9.0,31.0,15.0,0.0,48.0,2.0,25.0,20.0,0.0,0.0,0.0,11.0,19.0,0.0,8.0,31.0,76.0,0.0,0.0,49.0,0.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,81.0,96.0,1.0,2.0,1.0,99.0,88.0,41.0,12.0,17.0,6.0,14.0,30.0,53.0,74.0,26.0,0.0,0.0,37.0,0.0,15.0,38.0,0.0,10.0,99.0,0.0,99.0,99.0,96.0,6.0,2.0,4.0,8.0,18.0,9702.0,41.0,6.0,11.0,74.0,7.0,3.0,5.0,9306.0,15.0,9412.0,15.0,9506.0,9302.0,9306.0,4.0,10.571429,134891.0,1.0,1.0,37.0,USB,other,84720,A,0,XXXX,35.0,H,M,3.0,L,F,D,T,1.0,0,0.0


In [22]:
data.reset_index(drop=True, inplace=True)

In [23]:
data.shape

(25251, 340)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

display(categoricalX.head())

for col in categoricalX.columns:
    print(col, categoricalX[col].nunique())

print()
# We're going to drop columns "OSOURCE" and "ZIP" because they too many levels.
categoricalX.drop(columns = ['OSOURCE','ZIP'], axis = 1, inplace = True)
# we OneHotEncode the categoricals so we can use the same dataset to perform a regression later (in the lab).
# even though it is not needed for a DecisionTree or RandomForest model

encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

# Note: we need to do train/test split before downsampling, and then only downsample the training set - Why?

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


Unnamed: 0,OSOURCE,STATE,ZIP,MAILCODE,NOEXCH,MDMAUD,HOMEOWNR,GENDER,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A
0,BOA,CA,91326,A,0,XXXX,H,M,L,G,A,S
1,BRY,CA,95953,A,0,XXXX,U,F,L,E,C,R
2,DRK,IN,46755,A,0,XXXX,H,F,L,E,D,T
3,ENQ,other,56475,A,0,XXXX,H,F,L,D,D,R
4,USB,other,84720,A,0,XXXX,H,M,L,F,D,T


OSOURCE 580
STATE 12
ZIP 9098
MAILCODE 2
NOEXCH 3
MDMAUD 17
HOMEOWNR 2
GENDER 3
RFA_2R 1
RFA_2A 4
GEOCODE2 4
DOMAIN_A 5



In [25]:
data['TARGET_B'].value_counts()

0    23987
1     1264
Name: TARGET_B, dtype: int64

In [27]:
# for downsampling we need to temporarily concat X_train and y_train
trainset = pd.concat([X_train, y_train], axis=1)

# Manual way to downsample category 0:
category_0 = trainset[trainset['TARGET_B']==0].sample(len(trainset[trainset['TARGET_B']==1]))
print(category_0.shape)

category_1 = trainset[trainset['TARGET_B']== 1 ]
trainset_new = pd.concat([category_0, category_1], axis = 0)
trainset_new = trainset_new.sample(frac=1) #randomize the rows 
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']
#data = data.reset_index(drop=True)
print(X_train.shape)

(1026, 371)
(2052, 370)


In [28]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

In [29]:
y_test

16131    0
14503    0
19703    0
7849     0
13285    1
        ..
17316    0
249      0
18510    0
7449     0
7485     0
Name: TARGET_B, Length: 5051, dtype: int64

In [30]:
y_test_regression

16131    0.0
14503    0.0
19703    0.0
7849     0.0
13285    6.0
        ... 
17316    0.0
249      0.0
18510    0.0
7449     0.0
7485     0.0
Name: TARGET_D, Length: 5051, dtype: float64

In [31]:
X_train

Unnamed: 0,ODATEDW,TCODE,DOB,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,MAXADATE,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MINRDATE,MAXRAMNT,MAXRDATE,LASTGIFT,LASTDATE,FISTDATE,NEXTDATE,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,CLUSTER,DATASRCE,DOMAIN_B,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
21021,9401.0,28.0,6001.0,38.0,4.0,8.0,1.0,0.0,50.0,21.0,45.0,1.0,6.0,1.0,8.0,1784.0,541.0,772.0,99.0,0.0,0.0,45.0,55.0,98.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0,45.0,51.0,40.0,47.0,52.0,18.0,41.0,38.0,21.0,16.0,17.0,16.0,13.0,16.0,15.0,8.0,17.0,16.0,30.0,21.0,16.0,29.0,9.0,27.0,24.0,44.0,32.0,15.0,5.0,2.0,64.0,9.0,7.0,21.0,159.0,231.0,73.0,71.0,5.0,27.0,22.0,9.0,0.0,0.0,0.0,816.0,943.0,4.0,3.0,67.0,33.0,95.0,5.0,5.0,24.0,70.0,62.0,20.0,83.0,17.0,4.0,0.0,3.0,11.0,23.0,6.0,15.0,59.0,25.0,0.0,0.0,0.0,4.0,11.0,28.0,58.0,91.0,1.0,0.0,61.0,60.0,60.0,13.0,4.0,20.0,6.0,0.0,8.0,17.0,8.0,0.0,5.0,16.0,76.0,95.0,7920.0,427.0,619.0,327.0,416.0,410.0,489.0,19133.0,14.0,22.0,22.0,15.0,15.0,8.0,1.0,0.0,3.0,9.0,17.0,16.0,21.0,21.0,10.0,1.0,0.0,5.0,35.0,2.0,58.0,1.0,44.0,56.0,9.0,83.0,14.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,7.0,17.0,19.0,3.0,62.0,66.0,81.0,53.0,80.0,52.0,66.0,48.0,80.0,0.0,4.0,18.0,21.0,2.0,15.0,16.0,0.0,0.0,7.0,0.0,11.0,4.0,2.0,3.0,0.0,0.0,2.0,15.0,7.0,4.0,7.0,15.0,10.0,4.0,1.0,1.0,11.0,11.0,10.0,1.0,1.0,6.0,1.0,7.0,66.0,17.0,1.0,140.0,2.0,4.0,21.0,33.0,9.0,20.0,11.0,5.0,13.0,0.0,8.0,9.0,0.0,0.0,0.0,23.0,50.0,1.0,21.0,29.0,45.0,4.0,0.0,7.0,1.0,17.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,3.0,67.0,96.0,1.0,1.0,2.0,98.0,61.0,11.0,6.0,23.0,0.0,1.0,7.0,24.0,75.0,25.0,0.0,0.0,88.0,0.0,12.0,0.0,0.0,0.0,99.0,0.0,99.0,99.0,99.0,6.0,2.0,11.0,7.0,14.0,9702.0,36.0,5.0,11.0,44.0,4.0,2.0,5.0,9310.0,15.0,9512.0,15.0,9512.0,9310.0,9403.0,5.0,11.000000,104158.0,0.0,1.0,32.0,11.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
15166,9401.0,1.0,3910.0,58.0,5.0,8.0,6.0,0.0,20.0,52.0,6.0,8.0,2.0,1.0,8.0,1038.0,286.0,334.0,0.0,0.0,99.0,52.0,48.0,99.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,32.0,41.0,44.0,33.0,44.0,47.0,32.0,35.0,43.0,22.0,11.0,20.0,26.0,19.0,11.0,9.0,5.0,12.0,17.0,35.0,20.0,16.0,19.0,6.0,18.0,13.0,30.0,57.0,38.0,19.0,7.0,69.0,6.0,4.0,21.0,237.0,310.0,95.0,94.0,2.0,3.0,1.0,1.0,0.0,0.0,0.0,1472.0,1513.0,3.0,1.0,77.0,23.0,97.0,3.0,25.0,48.0,86.0,77.0,42.0,93.0,7.0,5.0,3.0,3.0,11.0,7.0,1.0,26.0,61.0,13.0,0.0,0.0,0.0,18.0,48.0,79.0,90.0,96.0,2.0,1.0,82.0,71.0,70.0,16.0,4.0,3.0,0.0,1.0,20.0,2.0,1.0,0.0,13.0,29.0,52.0,71.0,1600.0,51.0,602.0,515.0,534.0,557.0,582.0,16301.0,7.0,11.0,10.0,19.0,33.0,10.0,6.0,4.0,0.0,4.0,10.0,11.0,18.0,35.0,11.0,6.0,5.0,0.0,10.0,0.0,59.0,6.0,38.0,62.0,9.0,80.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,14.0,0.0,48.0,22.0,26.0,4.0,65.0,78.0,87.0,70.0,87.0,70.0,63.0,48.0,99.0,99.0,0.0,18.0,17.0,0.0,3.0,15.0,1.0,4.0,9.0,8.0,18.0,1.0,2.0,2.0,11.0,0.0,8.0,15.0,2.0,6.0,3.0,11.0,5.0,5.0,5.0,0.0,9.0,6.0,8.0,6.0,8.0,2.0,1.0,10.0,71.0,6.0,1.0,136.0,0.0,9.0,30.0,27.0,13.0,13.0,8.0,8.0,28.0,3.0,28.0,5.0,0.0,0.0,0.0,10.0,20.0,0.0,52.0,28.0,6.0,0.0,0.0,6.0,1.0,11.0,1.0,0.0,1.0,0.0,5.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,82.0,98.0,1.0,0.0,1.0,95.0,90.0,32.0,7.0,22.0,3.0,13.0,18.0,64.0,69.0,31.0,0.0,0.0,20.0,31.0,24.0,20.0,0.0,6.0,3.0,97.0,1.0,98.0,98.0,9.0,3.0,5.0,2.0,17.0,9702.0,40.0,6.0,13.0,76.0,4.0,1.0,10.0,9310.0,25.0,9410.0,25.0,9512.0,9310.0,9404.0,6.0,19.000000,92653.0,1.0,1.0,15.0,35.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5383,8601.0,2.0,2303.0,75.0,4.0,7.0,10.0,0.0,36.0,6.0,69.0,15.0,0.0,0.0,5.0,888.0,271.0,347.0,99.0,0.0,0.0,50.0,50.0,99.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,48.0,54.0,56.0,46.0,51.0,54.0,13.0,36.0,42.0,22.0,9.0,18.0,13.0,11.0,19.0,18.0,13.0,15.0,14.0,35.0,20.0,16.0,46.0,11.0,42.0,21.0,34.0,45.0,23.0,7.0,2.0,59.0,5.0,11.0,26.0,185.0,256.0,98.0,97.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1288.0,1322.0,4.0,3.0,97.0,3.0,98.0,2.0,0.0,19.0,78.0,64.0,17.0,91.0,9.0,2.0,1.0,1.0,10.0,24.0,1.0,11.0,53.0,36.0,0.0,0.0,0.0,5.0,25.0,82.0,96.0,99.0,0.0,0.0,56.0,54.0,58.0,14.0,4.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,20.0,50.0,60.0,80.0,1600.0,51.0,602.0,370.0,502.0,450.0,507.0,17958.0,17.0,10.0,19.0,11.0,23.0,19.0,0.0,0.0,0.0,11.0,7.0,20.0,12.0,27.0,23.0,0.0,0.0,0.0,49.0,2.0,63.0,1.0,25.0,75.0,3.0,74.0,14.0,10.0,4.0,6.0,0.0,0.0,3.0,0.0,0.0,7.0,23.0,28.0,10.0,66.0,57.0,64.0,50.0,60.0,47.0,75.0,52.0,0.0,99.0,0.0,7.0,18.0,4.0,7.0,16.0,0.0,15.0,4.0,0.0,20.0,5.0,2.0,1.0,0.0,0.0,10.0,23.0,2.0,2.0,2.0,12.0,15.0,7.0,4.0,0.0,4.0,0.0,6.0,13.0,15.0,0.0,0.0,9.0,76.0,0.0,0.0,120.0,20.0,8.0,34.0,22.0,5.0,7.0,5.0,7.0,3.0,0.0,7.0,4.0,0.0,0.0,0.0,19.0,36.0,0.0,6.0,16.0,69.0,5.0,0.0,0.0,0.0,6.0,3.0,0.0,5.0,8.0,0.0,28.0,0.0,0.0,0.0,1.0,3.0,20.0,72.0,75.0,0.0,0.0,25.0,93.0,45.0,14.0,22.0,35.0,0.0,0.0,0.0,3.0,34.0,66.0,0.0,0.0,99.0,0.0,0.0,0.0,0.0,0.0,99.0,0.0,99.0,98.0,99.0,9.0,3.0,11.0,15.0,30.0,9702.0,71.0,6.0,14.0,123.0,15.0,10.0,3.0,8703.0,25.0,9512.0,14.0,9602.0,8703.0,8804.0,13.0,8.200000,94790.0,1.0,3.0,29.0,5.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15938,9301.0,0.0,5904.0,39.0,4.0,4.0,2.0,0.0,17.0,9.0,23.0,7.0,6.0,3.0,4.0,1304.0,317.0,629.0,99.0,0.0,0.0,48.0,52.0,69.0,16.0,0.0,13.0,5.0,6.0,0.0,1.0,3.0,2.0,0.0,0.0,3.0,0.0,0.0,1.0,32.0,36.0,39.0,36.0,42.0,45.0,17.0,47.0,37.0,15.0,13.0,33.0,17.0,12.0,10.0,8.0,6.0,21.0,17.0,37.0,12.0,13.0,19.0,10.0,18.0,42.0,34.0,24.0,14.0,6.0,3.0,47.0,13.0,7.0,34.0,125.0,205.0,22.0,21.0,13.0,77.0,64.0,59.0,1.0,0.0,1.0,924.0,965.0,5.0,5.0,59.0,41.0,98.0,2.0,0.0,20.0,50.0,38.0,15.0,69.0,31.0,5.0,1.0,4.0,25.0,32.0,8.0,8.0,47.0,14.0,3.0,12.0,1.0,0.0,4.0,35.0,84.0,97.0,0.0,5.0,20.0,42.0,44.0,11.0,5.0,26.0,51.0,0.0,1.0,12.0,28.0,0.0,30.0,81.0,94.0,97.0,1600.0,51.0,602.0,320.0,359.0,365.0,419.0,18304.0,17.0,20.0,23.0,20.0,10.0,7.0,2.0,1.0,0.0,6.0,17.0,22.0,28.0,17.0,10.0,0.0,0.0,0.0,19.0,0.0,56.0,8.0,56.0,44.0,13.0,66.0,11.0,15.0,2.0,13.0,0.0,0.0,5.0,3.0,2.0,14.0,21.0,27.0,7.0,66.0,74.0,79.0,69.0,79.0,69.0,86.0,48.0,99.0,0.0,0.0,22.0,18.0,8.0,5.0,19.0,0.0,3.0,12.0,1.0,8.0,3.0,1.0,0.0,0.0,0.0,1.0,12.0,3.0,1.0,4.0,17.0,18.0,6.0,3.0,0.0,13.0,9.0,9.0,4.0,7.0,6.0,3.0,5.0,69.0,10.0,0.0,142.0,5.0,10.0,17.0,17.0,8.0,21.0,21.0,12.0,14.0,2.0,7.0,16.0,0.0,0.0,0.0,10.0,17.0,4.0,9.0,20.0,23.0,48.0,2.0,4.0,0.0,8.0,1.0,1.0,9.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,17.0,62.0,84.0,1.0,7.0,8.0,91.0,35.0,2.0,4.0,29.0,0.0,0.0,0.0,52.0,60.0,40.0,26.0,20.0,40.0,1.0,57.0,1.0,0.0,0.0,99.0,0.0,99.0,99.0,99.0,7.0,3.0,5.0,5.0,17.0,9702.0,41.0,6.0,13.0,58.0,5.0,4.0,3.0,9301.0,20.0,9510.0,20.0,9510.0,9301.0,9310.0,9.0,11.600000,89734.0,0.0,1.0,27.0,3.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6762,8901.0,2.0,5211.0,45.0,2.0,7.0,22.0,0.0,42.0,18.0,63.0,8.0,39.0,1.0,7.0,1370.0,407.0,619.0,99.0,0.0,0.0,46.0,54.0,98.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,39.0,45.0,47.0,42.0,49.0,51.0,18.0,51.0,39.0,11.0,6.0,17.0,26.0,10.0,15.0,17.0,8.0,20.0,23.0,36.0,14.0,7.0,33.0,12.0,30.0,27.0,44.0,29.0,15.0,4.0,1.0,61.0,13.0,9.0,17.0,152.0,221.0,95.0,94.0,3.0,4.0,1.0,0.0,0.0,0.0,0.0,656.0,671.0,4.0,4.0,83.0,17.0,96.0,4.0,8.0,22.0,66.0,54.0,17.0,80.0,20.0,5.0,1.0,5.0,12.0,29.0,7.0,16.0,53.0,29.0,0.0,1.0,0.0,0.0,0.0,5.0,27.0,87.0,0.0,1.0,64.0,56.0,59.0,13.0,4.0,4.0,0.0,0.0,13.0,3.0,0.0,0.0,22.0,42.0,72.0,87.0,8240.0,413.0,530.0,323.0,421.0,437.0,505.0,19630.0,8.0,23.0,20.0,18.0,19.0,7.0,3.0,1.0,1.0,6.0,12.0,26.0,11.0,27.0,8.0,5.0,2.0,2.0,42.0,1.0,65.0,1.0,39.0,61.0,5.0,94.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,13.0,16.0,1.0,36.0,64.0,72.0,57.0,72.0,57.0,71.0,47.0,99.0,99.0,0.0,32.0,23.0,4.0,7.0,19.0,0.0,0.0,8.0,1.0,5.0,0.0,0.0,1.0,0.0,0.0,3.0,6.0,1.0,3.0,2.0,13.0,6.0,2.0,1.0,4.0,5.0,15.0,10.0,29.0,8.0,39.0,1.0,8.0,35.0,9.0,0.0,142.0,3.0,9.0,20.0,20.0,7.0,17.0,24.0,2.0,20.0,2.0,11.0,9.0,0.0,0.0,0.0,22.0,42.0,5.0,18.0,29.0,63.0,7.0,2.0,16.0,1.0,6.0,0.0,0.0,4.0,2.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,40.0,98.0,2.0,0.0,0.0,99.0,63.0,16.0,12.0,34.0,0.0,1.0,3.0,6.0,25.0,75.0,0.0,0.0,35.0,6.0,37.0,20.0,0.0,1.0,99.0,0.0,99.0,99.0,99.0,5.0,2.0,7.0,9.0,26.0,9702.0,70.0,5.0,12.0,126.0,20.0,10.0,3.0,9211.0,9.0,9202.0,8.0,9602.0,8909.0,9003.0,6.0,6.300000,33311.0,1.0,3.0,32.0,24.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4500,8801.0,1.0,1901.0,79.0,6.0,6.0,1.0,4.0,23.0,24.0,36.0,3.0,2.0,1.0,7.0,2879.0,812.0,977.0,95.0,0.0,5.0,51.0,49.0,88.0,1.0,0.0,8.0,11.0,0.0,0.0,1.0,4.0,0.0,1.0,0.0,9.0,0.0,0.0,2.0,32.0,39.0,42.0,34.0,44.0,47.0,27.0,49.0,38.0,13.0,10.0,27.0,22.0,14.0,10.0,10.0,7.0,22.0,20.0,32.0,17.0,9.0,23.0,5.0,19.0,13.0,36.0,51.0,32.0,14.0,5.0,69.0,9.0,4.0,17.0,207.0,293.0,81.0,79.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2067.0,2157.0,7.0,7.0,87.0,13.0,96.0,4.0,0.0,41.0,83.0,73.0,35.0,89.0,11.0,6.0,1.0,4.0,10.0,11.0,4.0,20.0,52.0,15.0,1.0,1.0,0.0,54.0,87.0,98.0,99.0,99.0,7.0,1.0,63.0,61.0,61.0,15.0,5.0,1.0,0.0,17.0,11.0,1.0,0.0,1.0,71.0,85.0,93.0,94.0,7320.0,15.0,825.0,486.0,499.0,511.0,522.0,17380.0,8.0,11.0,12.0,22.0,28.0,14.0,4.0,1.0,0.0,9.0,8.0,14.0,20.0,31.0,15.0,2.0,1.0,0.0,23.0,2.0,51.0,3.0,67.0,33.0,13.0,78.0,16.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,1.0,9.0,21.0,30.0,9.0,63.0,69.0,77.0,61.0,74.0,58.0,63.0,56.0,99.0,99.0,0.0,18.0,12.0,5.0,21.0,12.0,0.0,3.0,5.0,1.0,15.0,2.0,4.0,1.0,2.0,0.0,13.0,21.0,5.0,1.0,10.0,13.0,12.0,3.0,3.0,1.0,5.0,5.0,4.0,2.0,3.0,2.0,1.0,15.0,75.0,3.0,1.0,135.0,5.0,8.0,27.0,28.0,7.0,20.0,6.0,4.0,21.0,2.0,15.0,8.0,2.0,4.0,0.0,13.0,23.0,2.0,24.0,31.0,36.0,7.0,4.0,5.0,0.0,7.0,0.0,0.0,4.0,3.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,8.0,43.0,85.0,7.0,4.0,4.0,99.0,78.0,19.0,3.0,11.0,18.0,39.0,49.0,73.0,89.0,11.0,0.0,0.0,82.0,4.0,9.0,1.0,0.0,3.0,99.0,0.0,85.0,99.0,99.0,15.0,2.0,5.0,3.0,25.0,9702.0,65.0,6.0,14.0,345.0,11.0,5.0,5.0,8809.0,35.0,9207.0,35.0,9511.0,8809.0,8903.0,6.0,31.363636,152337.0,1.0,3.0,12.0,27.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16756,8601.0,2.0,4601.0,52.0,6.0,7.0,0.0,0.0,31.0,17.0,45.0,9.0,2.0,2.0,7.0,2365.0,518.0,731.0,99.0,0.0,0.0,48.0,52.0,54.0,8.0,1.0,31.0,16.0,1.0,9.0,3.0,12.0,2.0,1.0,0.0,10.0,2.0,1.0,3.0,35.0,42.0,47.0,39.0,47.0,50.0,19.0,46.0,32.0,21.0,11.0,25.0,16.0,13.0,13.0,9.0,12.0,21.0,21.0,25.0,16.0,17.0,19.0,5.0,14.0,20.0,31.0,49.0,31.0,14.0,6.0,46.0,11.0,13.0,30.0,197.0,283.0,63.0,61.0,0.0,35.0,34.0,33.0,12.0,12.0,0.0,2379.0,2413.0,8.0,8.0,66.0,34.0,93.0,7.0,0.0,33.0,71.0,55.0,26.0,73.0,27.0,7.0,2.0,5.0,17.0,21.0,9.0,8.0,30.0,16.0,1.0,5.0,2.0,84.0,93.0,99.0,99.0,99.0,10.0,8.0,34.0,48.0,47.0,15.0,6.0,2.0,33.0,1.0,9.0,1.0,23.0,0.0,86.0,94.0,98.0,99.0,4480.0,13.0,803.0,474.0,493.0,519.0,549.0,18512.0,9.0,3.0,12.0,32.0,24.0,15.0,4.0,1.0,0.0,2.0,6.0,16.0,29.0,29.0,13.0,5.0,1.0,0.0,28.0,4.0,51.0,1.0,50.0,50.0,10.0,78.0,16.0,2.0,2.0,0.0,0.0,1.0,3.0,1.0,0.0,2.0,21.0,25.0,3.0,71.0,60.0,69.0,53.0,65.0,53.0,71.0,55.0,87.0,99.0,2.0,13.0,16.0,14.0,9.0,25.0,0.0,1.0,5.0,1.0,8.0,4.0,3.0,1.0,2.0,0.0,1.0,28.0,8.0,3.0,5.0,10.0,8.0,7.0,2.0,1.0,13.0,4.0,4.0,4.0,9.0,2.0,2.0,8.0,74.0,4.0,0.0,133.0,8.0,12.0,21.0,26.0,14.0,13.0,5.0,4.0,16.0,0.0,9.0,10.0,0.0,0.0,0.0,13.0,31.0,0.0,17.0,28.0,45.0,15.0,0.0,1.0,0.0,6.0,0.0,0.0,3.0,2.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,21.0,40.0,75.0,5.0,17.0,3.0,96.0,65.0,29.0,7.0,24.0,0.0,2.0,16.0,27.0,69.0,31.0,7.0,6.0,77.0,1.0,22.0,0.0,0.0,0.0,99.0,0.0,99.0,99.0,99.0,10.0,2.0,8.0,7.0,34.0,9702.0,81.0,6.0,13.0,184.0,19.0,13.0,5.0,9312.0,31.0,9207.0,10.0,9509.0,8609.0,8710.0,13.0,9.684211,145424.0,1.0,2.0,4.0,20.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1904,9201.0,0.0,4509.0,52.0,5.0,9.0,22.0,0.0,35.0,12.0,46.0,7.0,0.0,2.0,9.0,1149.0,404.0,498.0,99.0,0.0,0.0,48.0,52.0,97.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,48.0,50.0,42.0,48.0,50.0,15.0,50.0,34.0,16.0,5.0,20.0,18.0,16.0,22.0,15.0,3.0,26.0,18.0,29.0,15.0,12.0,25.0,3.0,22.0,16.0,54.0,29.0,14.0,3.0,1.0,72.0,8.0,4.0,16.0,162.0,231.0,97.0,45.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,1542.0,2085.0,7.0,7.0,94.0,6.0,92.0,8.0,19.0,22.0,81.0,71.0,18.0,90.0,10.0,4.0,1.0,4.0,9.0,15.0,2.0,13.0,59.0,25.0,0.0,0.0,0.0,47.0,50.0,95.0,99.0,99.0,20.0,2.0,81.0,58.0,62.0,13.0,3.0,1.0,0.0,0.0,4.0,0.0,1.0,0.0,79.0,79.0,90.0,90.0,1600.0,51.0,602.0,601.0,635.0,724.0,764.0,35876.0,6.0,4.0,9.0,15.0,28.0,14.0,9.0,3.0,12.0,2.0,4.0,9.0,15.0,28.0,15.0,10.0,3.0,14.0,20.0,3.0,60.0,2.0,38.0,62.0,6.0,82.0,5.0,3.0,1.0,2.0,0.0,0.0,6.0,4.0,1.0,10.0,22.0,30.0,13.0,55.0,72.0,84.0,61.0,83.0,60.0,56.0,33.0,99.0,99.0,0.0,19.0,31.0,3.0,15.0,19.0,0.0,0.0,2.0,0.0,6.0,1.0,2.0,1.0,0.0,2.0,4.0,13.0,4.0,1.0,10.0,16.0,15.0,9.0,1.0,1.0,11.0,3.0,9.0,2.0,3.0,1.0,0.0,8.0,81.0,7.0,0.0,143.0,3.0,7.0,21.0,25.0,5.0,26.0,13.0,2.0,13.0,0.0,11.0,3.0,0.0,0.0,0.0,19.0,39.0,0.0,14.0,17.0,57.0,5.0,0.0,2.0,0.0,17.0,0.0,1.0,1.0,3.0,0.0,7.0,0.0,12.0,0.0,0.0,0.0,7.0,74.0,87.0,0.0,2.0,11.0,98.0,73.0,9.0,7.0,12.0,5.0,11.0,46.0,87.0,91.0,9.0,27.0,3.0,98.0,0.0,2.0,0.0,0.0,0.0,94.0,6.0,95.0,99.0,98.0,9.0,4.0,10.0,14.0,16.0,9702.0,39.0,6.0,12.0,47.0,3.0,1.0,10.0,9209.0,20.0,9304.0,17.0,9512.0,9209.0,9304.0,7.0,15.666667,88037.0,1.0,1.0,4.0,14.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8853,8601.0,0.0,1909.0,78.0,6.0,3.0,3.0,4.0,28.0,24.0,28.0,11.0,3.0,5.0,3.0,1145.0,291.0,380.0,99.0,0.0,0.0,45.0,55.0,89.0,1.0,0.0,1.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,3.0,34.0,44.0,49.0,37.0,47.0,51.0,28.0,34.0,42.0,24.0,13.0,17.0,20.0,12.0,14.0,16.0,8.0,14.0,13.0,33.0,21.0,19.0,39.0,11.0,33.0,22.0,26.0,52.0,34.0,21.0,11.0,52.0,13.0,8.0,27.0,213.0,299.0,97.0,95.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,501.0,510.0,3.0,3.0,79.0,21.0,98.0,2.0,0.0,40.0,77.0,54.0,28.0,89.0,11.0,12.0,3.0,10.0,10.0,23.0,2.0,20.0,48.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,50.0,0.0,3.0,46.0,53.0,53.0,15.0,6.0,2.0,0.0,0.0,19.0,2.0,0.0,0.0,6.0,22.0,53.0,78.0,2320.0,371.0,765.0,236.0,250.0,247.0,269.0,8931.0,32.0,24.0,23.0,16.0,2.0,2.0,0.0,0.0,0.0,27.0,23.0,28.0,16.0,3.0,3.0,0.0,0.0,0.0,29.0,8.0,28.0,21.0,35.0,65.0,9.0,74.0,19.0,2.0,2.0,0.0,0.0,0.0,3.0,2.0,2.0,0.0,16.0,17.0,2.0,61.0,65.0,80.0,53.0,73.0,44.0,49.0,61.0,63.0,0.0,7.0,12.0,3.0,2.0,11.0,34.0,0.0,2.0,18.0,0.0,9.0,5.0,3.0,1.0,0.0,0.0,2.0,11.0,2.0,2.0,6.0,33.0,7.0,8.0,4.0,0.0,2.0,8.0,5.0,9.0,11.0,3.0,5.0,0.0,76.0,5.0,0.0,120.0,15.0,9.0,43.0,17.0,5.0,9.0,1.0,2.0,32.0,2.0,25.0,7.0,2.0,4.0,0.0,13.0,28.0,1.0,24.0,35.0,28.0,22.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,67.0,37.0,62.0,1.0,1.0,90.0,45.0,15.0,15.0,38.0,0.0,0.0,2.0,2.0,8.0,92.0,0.0,0.0,99.0,0.0,0.0,0.0,0.0,0.0,99.0,0.0,98.0,98.0,94.0,3.0,2.0,7.0,5.0,30.0,9702.0,77.0,5.0,11.0,122.0,18.0,9.0,3.0,8705.0,15.0,9412.0,6.0,9603.0,8705.0,8708.0,3.0,6.777778,128727.0,1.0,2.0,31.0,21.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [35]:
list(X_train.columns).index(0)

326

In [38]:
X_train = X_train.iloc[:,:326]
X_test = X_test.iloc[:,:326]

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.8,
                             random_state = 42)

clf.fit(X_train, y_train)

print("The Accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train, y_train)))
print("The Accuracy for the Random Forest in the TEST  set is {:.2f}".format(clf.score(X_test, y_test)))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

The Accuracy for the Random Forest in the TRAIN set is 0.79
The Accuracy for the Random Forest in the TEST  set is 0.51


0    4813
1     238
Name: TARGET_B, dtype: int64

array([[2442, 2371],
       [ 124,  114]])

In [40]:
# For cross validation
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.8)

cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)

print("The mean Accuracy of the folds was {:.2f}".format(np.mean(cross_val_scores)))

The mean Accuracy of the folds was 0.50


In [41]:
cross_val_scores

array([0.52427184, 0.5       , 0.43414634, 0.52682927, 0.47804878,
       0.52195122, 0.49268293, 0.53170732, 0.45853659, 0.54146341])