In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Load the training and test CSV files from the Kaggle input directory.
train_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/income-qualification/train.csv"
)
test_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/income-qualification/test.csv"
)

In [None]:
# Analyzing the train data
#
# Target variable has 4 values corresponding to:
# 1 = extreme poverty
# 2 = moderate poverty
# 3 = vulnerable households
# 4 = non vulnerable households

# `head` must be *called*: printing the bare attribute shows the bound-method
# repr instead of the first rows. Leaving the call as the last expression lets
# the notebook render it as a table.
train_data.head()

In [None]:
# Summary statistics for the training frame, printed as plain text.
train_summary = train_data.describe()
print(train_summary)

In [None]:
# Check for class imbalance: count the rows per Target value and chart the counts.
target_count = train_data["Target"].value_counts()
target_count.plot.bar(title="Target count")

**The classes are imbalanced: most of the samples belong to non-vulnerable households, so the data needs to be balanced before training.
I'm using random oversampling to balance the classes.**

In [None]:
# Data preprocessing

# True when at least one cell anywhere in the training frame is missing.
train_data.isna().any().any()

In [None]:
# Drop the Id column (presumably a row identifier rather than a feature).
# Rebinding the result instead of using `inplace=True`, together with
# errors="ignore", keeps this cell idempotent: re-running it after "Id" is
# already gone no longer raises a KeyError.
train_data = train_data.drop(columns="Id", errors="ignore")

In [None]:
# Convert string (object-dtype) columns to integer codes.
# The dtypes are printed before and after so the conversion can be checked.
print(train_data.dtypes)

# Collect the object-dtype columns first, then replace each one with the
# integer codes from pd.factorize (each distinct value gets its own code;
# pandas uses -1 for missing values by default).
object_columns = [
    name for name in train_data.columns if train_data[name].dtype == "object"
]
for name in object_columns:
    train_data[name] = pd.factorize(train_data[name])[0]

print(train_data.dtypes)

In [None]:
# Fill every missing value with the mean of its own column,
# then print whether any missing value is left.
column_means = train_data.mean()
train_data = train_data.fillna(column_means)
print(train_data.isnull().any().any())

In [None]:
# Show the column dtypes again, after the encoding and imputation cells.
column_dtypes = train_data.dtypes
print(column_dtypes)

In [None]:
import imblearn
from imblearn.over_sampling import RandomOverSampler
from sklearn import preprocessing

# Split the frame into the feature matrix X (every column except Target)
# and the label column Y (kept as an (n_samples, 1) array, as before).
X = np.array(train_data.drop(columns="Target"))
Y = np.array(train_data[["Target"]])
print("X_data size is ", X.shape, " Y_data size is ", Y.shape)

# Rescale every feature to the [0, 1] range.
min_max_scaler = preprocessing.MinMaxScaler()
X_normalized = min_max_scaler.fit_transform(X)
X = pd.DataFrame(X_normalized)

# Balance the classes by randomly duplicating minority-class rows.
# - `fit_sample` was removed from imbalanced-learn; `fit_resample` is the
#   supported name for the same operation.
# - A fixed random_state makes the resampling reproducible across runs.
# - The labels are flattened to 1-D because the sampler expects a 1-D target;
#   passing the column vector triggers a conversion warning.
# NOTE(review): scaling and oversampling run before the train/test split in a
# later cell, so copies of the same row can land on both sides of that split
# and the held-out accuracy will look better than it really is. Consider
# splitting first and resampling only the training part.
ros = RandomOverSampler(random_state=42)
X_oversampled, Y_oversampled = ros.fit_resample(X, Y.ravel())
print("X_data size is ", X_oversampled.shape, " Y_data size is ", Y_oversampled.shape)

In [None]:
from sklearn.decomposition import PCA

# Reduce dimensionality: a float n_components asks PCA to keep as many
# components as are needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
principalComponents = pca.fit_transform(X_oversampled)

# NOTE(review): X_oversampled is overwritten with the projected data, so
# re-running this cell without re-running the oversampling cell would apply
# PCA to data that has already been projected.
X_oversampled = principalComponents

# Preview the first rows of the projected data. `head` has to be called;
# printing the bare attribute shows the bound-method repr instead.
principalDf = pd.DataFrame(principalComponents)
print(principalDf.head())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the (oversampled) rows for evaluation. The split is
# stratified on the labels and seeded so it is the same on every run.
X_train, X_Test, Y_train, Y_Test = train_test_split(
    X_oversampled,
    Y_oversampled,
    test_size=0.33,
    random_state=1,
    stratify=Y_oversampled,
)

In [None]:
# Baseline classifier: a random forest with tree depth capped at 10
# and a fixed seed.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0, max_depth=10)
clf.fit(X=X_train, y=Y_train)


In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the baseline forest on the held-out split.
y_pred = clf.predict(X_Test)
accuracy_score(y_true=Y_Test, y_pred=y_pred)

In [None]:
# Random forest hyperparameter tuning with a cross-validated randomized search
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest: 100, 200, 300, 400, 500
n_estimators = list(range(100, 501, 100))
# Number of features to consider at every split.
# 'auto' is no longer accepted by current scikit-learn releases and, for
# classifiers, was only an alias of 'sqrt'; 'log2' takes its place so the
# search still compares two distinct settings.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 35, 60, 85, 110, or None (no limit)
max_depth = list(range(10, 111, 25))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters.
# First create the base model to tune; it is seeded so that repeated runs of
# the search score the same candidates identically.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation,
# sampling 50 combinations from the grid. n_jobs=-1 is what actually makes
# the search use all available cores. verbose=1 keeps a progress summary
# without printing one line per fit.
# NOTE(review): X_train comes from data that was oversampled before the
# split, so duplicated rows can appear in several folds and the
# cross-validation scores are likely optimistic.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(X_train, Y_train)

In [None]:
# Hyperparameter combination that scored best in the randomized search
# (displayed as the cell output).
rf_random.best_params_

In [None]:
# Train a forest with the hyperparameters selected by the randomized search.
# The values are read from rf_random.best_params_ instead of being pasted in
# by hand, so this model always matches what the search found on the current
# run. The fixed random_state keeps the fit itself reproducible.
base_model = RandomForestClassifier(**rf_random.best_params_, random_state=42)
base_model.fit(X_train, Y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the tuned forest on the same held-out split.
# (Despite the name, y_pred_regres holds class predictions from a classifier.)
y_pred_regres = base_model.predict(X_Test)
accuracy_score(y_true=Y_Test, y_pred=y_pred_regres)

In [None]:
# Print the baseline predictions, then the tuned-model predictions, one per line.
print(y_pred, y_pred_regres, sep="\n")