# Predicting using only some features

The purpose of this notebook is to build a model from the features that have the highest correlation with the target, after removing the features whose correlation with one another is above 0.75.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
%matplotlib inline

# Directory holding the input CSV files; every read_csv call below resolves its file name against it.
input_path = Path('/kaggle/input/amex-default-prediction/')

In [None]:
# Load only the first 1M statement rows: this sample is used solely to estimate
# feature/target correlations, so the whole training file is not needed yet.
CORR_SAMPLE_ROWS = 1_000_000

train_data_first = pd.read_csv(
    input_path / 'train_data.csv',
    index_col='customer_ID',
    nrows=CORR_SAMPLE_ROWS)

# The labels are read in full. A row cap sized for the statement file has no
# relation to which customers were sampled, and could silently drop labels for
# sampled customers. The labels are narrowed to the sampled customers afterwards.
train_labels_first = pd.read_csv(input_path / 'train_labels.csv', index_col='customer_ID')

In [None]:
# Keep only the labels whose customer_ID occurs in the sampled statement rows.
sampled_customer_mask = train_labels_first.index.isin(train_data_first.index)
train_labels_first = train_labels_first.loc[sampled_customer_mask]

In [None]:
# Keep a single row per customer: the final row of each customer's group,
# which is treated as that customer's last month.
# NOTE(review): assumes the rows of each customer are in chronological order — confirm.
last_month_train_data_first = train_data_first.groupby(by='customer_ID').tail(n=1)

In [None]:
# Attach the target to each customer's last-month row. The one-to-one
# validation makes the merge fail loudly if either side has a duplicated
# customer_ID instead of silently multiplying rows.
last_month_train_data_first = pd.merge(
    last_month_train_data_first,
    train_labels_first,
    how='inner',
    on='customer_ID',
    validate='one_to_one',
)

In [None]:
# Rank the features by the absolute value of their correlation with the target
# and keep the 20 strongest.
# Only numeric columns are considered: DataFrame.corr() raises on date/string
# columns in pandas >= 2.0 (older versions dropped them silently).
# The target is removed from the ranking by name rather than by assuming it
# sorts into first position.
target_correlation = (
    last_month_train_data_first
    .select_dtypes(include='number')
    .corr()['target']
    .abs()
    .drop('target')
    .sort_values(ascending=False)
)
best_20_pred = target_correlation.index[:20]

In [None]:
# Drop redundant features: among the 20 candidates, discard any feature that is
# strongly correlated (positively OR negatively) with a candidate listed before it.
CORRELATION_THRESHOLD = 0.75

cor_matrix = last_month_train_data_first[best_20_pred].corr()

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (self-correlation of 1.0) is ignored. The absolute value is used so
# that a correlation of -0.9 is treated as just as redundant as +0.9.
upper_tri = cor_matrix.abs().where(np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1))

# A column's entries in the upper triangle are its correlations with the
# candidates that precede it in best_20_pred, so the later column of a
# correlated pair is the one that gets dropped.
to_drop = [column for column in upper_tri.columns
           if (upper_tri[column] > CORRELATION_THRESHOLD).any()]

# Build the final list in the order of best_20_pred; a set difference would
# yield an arbitrary column order that can change between kernel sessions.
predictors = [column for column in best_20_pred if column not in to_drop]

In [None]:
# Re-read the training file in full, restricted to the selected predictors
# plus the customer key.
train_columns = predictors + ['customer_ID']
train_data = pd.read_csv(input_path / 'train_data.csv', usecols=train_columns)

# Labels are indexed by customer_ID, which is the key used for the join below.
train_labels = pd.read_csv(input_path / 'train_labels.csv', index_col='customer_ID')

# Take the final row of each customer, then attach that customer's target.
# The one-to-one validation rejects duplicated customer keys on either side.
last_month_train_data = pd.merge(
    train_data.groupby('customer_ID').tail(1),
    train_labels,
    on='customer_ID',
    how='inner',
    validate='one_to_one',
)

In [None]:
# Fit a random forest on the last-month rows of every training customer.
# random_state pins the bootstrap sampling and split choices so a fresh
# "Restart & Run All" reproduces the same model; n_jobs=-1 builds the trees on
# all available cores.
# Missing values are replaced with the sentinel -999 before fitting; the same
# fill has to be applied to any data passed to rf.predict.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf.fit(last_month_train_data[predictors].fillna(-999),
       last_month_train_data['target'])

In [None]:
# Read the test file, limited to the selected predictors plus the customer key.
test_columns = predictors + ['customer_ID']
test_data = pd.read_csv(input_path / 'test_data.csv', usecols=test_columns)

In [None]:
# One row per test customer: the final row of each customer's group.
last_month_test_data = test_data.groupby(by='customer_ID').tail(n=1)

In [None]:
# Score each test customer's last-month row, filling missing values with the
# same -999 sentinel used when the model was fitted.
# assign() returns a new frame instead of writing a column into the slice
# produced by groupby().tail(), which would raise SettingWithCopyWarning.
last_month_test_data = last_month_test_data.assign(
    prediction=rf.predict(last_month_test_data[predictors].fillna(-999))
)

In [None]:
# Write the submission file: one line per customer with its predicted value,
# without the DataFrame index.
submission = last_month_test_data[['customer_ID', 'prediction']]
submission.to_csv('submission.csv', index=False)