In [3]:
# Loading Packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.decomposition import PCA

%matplotlib inline

In [None]:
def test_data_prep(filepath):
    
    # load and format data
    with open(filepath, 'r') as fp:
        data = fp.read()
        print('No. of records: ', len(data.split('\n')))

    data_lines = data.split('\n')
    headers = data_lines[0].split('\t')
    data = [d.split('\t') for d in data_lines[1:]]
    
    # Create dataframe
    df = pd.DataFrame(data=data, columns=np.array(headers))
    df = df.set_index('Index')
    df.head()
    
    # Change data types of columns
    df[['F1', 'F2', 'F3', 'F4']] = df[['F1', 'F2', 'F3', 'F4']].apply(pd.to_numeric)
    df[['F15', 'F16']] = df[['F15', 'F16']].apply(pd.to_datetime)
    for col in ['F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22']:
        df[col] = df[col].astype(np.int64)

    df.insert(loc=len(df.columns)-1, column='Date flag', value= (df['F16'] - df['F15']).dt.days)

    x = df[['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F17', 'F18', 'F19', 'F20', 'F21', 'F22', 'Date flag']]
    
    # Drop columns with high multicollinearity
    x.drop(['F19'], axis=1, inplace=True)
    x.drop(['F20'], axis=1, inplace=True)
    
    return x

# Build the feature frame from the raw test file.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
X_test = test_data_prep(r'C:\Users\User\Downloads\BuyAffinity_Test.txt')

# Take an explicit copy: the loop below assigns columns into X_test_std, and
# writing into a plain column slice of X_test raises SettingWithCopyWarning.
X_test_std = X_test[['F1', 'F2', 'F3', 'F4', 'F17', 'F18', 'F21', 'F22', 'Date flag']].copy()

# NOTE(review): `standardize` is not defined in the visible cells; it must be
# defined before this cell runs on a fresh kernel.
# NOTE(review): F1-F4 are carried over as-is while the columns listed below are
# passed through `standardize`; confirm this matches how the model was trained.
for col in ['F5', 'F6', 'F7', 'F8', 'F9', 'F10', 'F11', 'F12', 'F13', 'F14', 'F17', 'F18', 'F21', 'F22', 'Date flag']:
    X_test_std[col] = standardize(X_test[col])
    

In [None]:
# test data
# Display the first rows of X_test_std as a quick visual check of the prepared features.
X_test_std.head()

In [None]:
# finding principal components
# Each feature group is reduced to its first principal component.
# NOTE(review): fit_transform() re-fits the PCA on the test data itself. If the
# model was trained on components fitted on the training set, those fitted
# PCA objects should be reused here with .transform() instead; confirm.
pca = PCA(n_components=1)

pc_F1_F4 = pca.fit_transform(X_test_std[['F1', 'F2', 'F3', 'F4']])
pc_F5_F9 = pca.fit_transform(X_test_std[['F5', 'F6', 'F7', 'F8', 'F9']])
pc_F10_F14 = pca.fit_transform(X_test_std[['F10', 'F11', 'F12', 'F13', 'F14']])

# Explicit copy so the new columns are added to an independent frame rather
# than to a slice of X_test_std (avoids SettingWithCopyWarning).
X_test_pca = X_test_std[['F17', 'F18', 'F21', 'F22', 'Date flag']].copy()

# fit_transform returns an (n_samples, 1) array; take column 0 so every
# component is stored as a 1-D column. The column labels are kept exactly as
# they were (including 'pc' vs 'pc1') because the model receives this frame.
X_test_pca['pc F1-F4'] = pc_F1_F4[:, 0]
X_test_pca['pc1 F5-F9'] = pc_F5_F9[:, 0]
X_test_pca['pc1 F10-F14'] = pc_F10_F14[:, 0]

X_test_pca.head()

In [None]:
# predicting on test data
# Score every prepared test row with rf_model (defined outside this cell).
y_pred_test = rf_model.predict(X_test_pca)

# One 'prediction' column keyed by the test-row identifiers; the index is
# labelled 'Index'.
results_df = pd.DataFrame(
    {'prediction': y_pred_test},
    index=X_test_pca.index.rename('Index'),
)

In [None]:
# saving result to csv file
# All predictions (both classes) are written, exactly as before.
# NOTE(review): absolute, machine-specific output path.
results_df.to_csv(r'C:\Users\User\Desktop\Test_Result.csv')

# Preview the rows predicted as class 1. Previously this filter sat before
# to_csv(), where its result was discarded (neither displayed nor saved).
# NOTE(review): if only these rows were meant to be saved, filter before to_csv().
results_df[results_df['prediction'] == 1].head()