In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample

In [2]:
categorical = pd.read_csv('categorical.csv')
numerical = pd.read_csv('numerical.csv')
target = pd.read_csv('target.csv')

data = pd.concat([categorical,numerical,target],axis=1)

In [3]:
data.head()

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,IL,36,H,F,3,L,E,C,T,2,...,12.0,10.0,4,7.741935,95515,0,4,39,0,0.0
1,CA,14,H,M,3,L,G,A,S,1,...,25.0,25.0,18,15.666667,148535,0,2,1,0,0.0
2,NC,43,U,M,3,L,E,C,R,2,...,16.0,5.0,12,7.481481,15078,1,4,60,0,0.0
3,CA,44,U,F,3,L,E,C,R,2,...,11.0,10.0,9,6.8125,172556,1,4,41,0,0.0
4,FL,16,H,F,3,L,F,A,S,2,...,15.0,15.0,14,6.864865,7112,1,2,26,0,0.0


In [4]:
data['TARGET_B'].value_counts()


0    90569
1     4843
Name: TARGET_B, dtype: int64

In [5]:
data.isna().sum()

STATE       0
CLUSTER     0
HOMEOWNR    0
GENDER      0
DATASRCE    0
           ..
HPHONE_D    0
RFA_2F      0
CLUSTER2    0
TARGET_B    0
TARGET_D    0
Length: 339, dtype: int64

In [6]:
y = data['TARGET_B']
X = data.drop(['TARGET_B', 'TARGET_D'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

# we OneHotEncode the categoricals 
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categoricalX = X.select_dtypes(np.object)


In [7]:
# for downsampling we need to temporarily concat X_train and y_train
trainset = pd.concat([X_train, y_train], axis=1)

# quicker way to downsample category 0:
category_0 = trainset[trainset['TARGET_B'] == 0]
print(category_0.shape)
category_1 = trainset[trainset['TARGET_B'] == 1]
print(category_1.shape)

(72486, 355)
(3843, 355)


In [9]:
#Random Forest
category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))


data_upsampled = pd.concat([category_0, category_1_oversampled], axis = 0)
X_train_upsampled = data_upsampled.drop(['TARGET_B'], axis=1)
y_train_upsampled = data_upsampled['TARGET_B']
print(X_train.shape)

(76329, 354)


In [10]:
from sklearn.ensemble import RandomForestClassifier
# Fit the Random Forests algorithm on the training data
rfc = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
rfc.fit(X_train_upsampled, y_train_upsampled)
print(rfc.score(X_train_upsampled,y_train_upsampled))
print(rfc.score(X_test,y_test))
# Make predictions on the test data
y_pred = rfc.predict(X_test)



0.6223960488921999




0.5967091128229314




In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Calculate the confusion matrix of the model
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', conf_matrix)

# For cross validation
cross_val_scores = cross_val_score(rfc, X_train_upsampled, y_train_upsampled, cv=10)
print(np.mean(cross_val_scores))

Accuracy: 0.5967091128229314
Confusion matrix:
 [[10833  7250]
 [  446   554]]




0.6171811735172079


## Use Feature Selections that you have learned in class to decide if you want to use all of the features (Variance Threshold, RFE, PCA, etc.)

In [12]:
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm


In [13]:
X_added_constant = sm.add_constant(X_train_upsampled)
X_added_constant
model = sm.OLS(y_train_upsampled,X_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,TARGET_B,R-squared:,0.074
Model:,OLS,Adj. R-squared:,0.072
Method:,Least Squares,F-statistic:,32.69
Date:,"Sun, 16 Apr 2023",Prob (F-statistic):,0.0
Time:,16:33:44,Log-Likelihood:,-99640.0
No. Observations:,144972,AIC:,200000.0
Df Residuals:,144617,BIC:,203500.0
Df Model:,354,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.6287,0.508,-16.991,0.000,-9.624,-7.633
CLUSTER,0.0044,0.001,5.395,0.000,0.003,0.006
DATASRCE,-0.0079,0.002,-3.413,0.001,-0.012,-0.003
DOMAIN_B,-0.0306,0.004,-6.814,0.000,-0.039,-0.022
ODATEW_YR,0.0018,0.002,0.818,0.414,-0.002,0.006
ODATEW_MM,0.0264,0.005,5.085,0.000,0.016,0.037
DOB_YR,-0.0001,7.4e-05,-1.992,0.046,-0.000,-2.37e-06
DOB_MM,0.0034,0.000,7.517,0.000,0.003,0.004
MINRDATE_YR,0.0031,0.001,3.914,0.000,0.002,0.005

0,1,2,3
Omnibus:,612550.883,Durbin-Watson:,0.148
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17782.173
Skew:,-0.01,Prob(JB):,0.0
Kurtosis:,1.284,Cond. No.,45600000.0


In [14]:
# Fit the Random Forests algorithm on the training data
rfc = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
rfc.fit(X_train_upsampled, y_train_upsampled)
print(rfc.score(X_train_upsampled,y_train_upsampled))
print(rfc.score(X_test,y_test))
# Make predictions on the test data
y_pred = rfc.predict(X_test)



0.6223960488921999




0.5967091128229314


