# Outline

1. Imports
1. Opening the Data
1. Fill NaNs
1. Dummy Variables
1. Training and Testing

#### There is no need for feature scaling here, since decision-tree-based algorithms don't require it.

## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.ensemble import GradientBoostingClassifier

## 2. Opening the Data

In [None]:
# Read the tab-separated test file into a single frame.
df_test_small = pd.read_csv('orange_small_test.data', sep='\t')

# Positional split: everything except the last 40 columns is cast to float,
# and the last 40 columns are cast to pandas' categorical dtype.
df_num_test = df_test_small.iloc[:, :-40].astype('float')
df_cat_test = df_test_small.iloc[:, -40:].astype('category')

## 3. Fill NaNs

In [None]:
# Read the saved per-variable means (presumably computed on the training
# set -- confirm against the notebook that wrote train_mean.csv). The file's
# own first row is skipped and replaced by explicit column names.
train_mean_table = pd.read_csv('train_mean.csv',
                               skiprows=1,
                               names=['Variable', 'Mean'])

# Keep only the 'Mean' column as a Series indexed by variable name.
mean_train = pd.Series(train_mean_table.set_index('Variable')['Mean'])

In [None]:
# Fill NaNs in the numeric columns from mean_train. With a Series as the fill
# value, pandas matches its index against the column names; columns without
# an entry in the Series are left unfilled.
df_num_test = df_num_test.fillna(value=mean_train)

#### Running the following twice will give an error due to adding the same category twice.

In [None]:
# Replace NaNs in every categorical column with an explicit 'missing' level.
for col in df_cat_test.columns:
    # add_categories raises ValueError when the category already exists, so
    # register 'missing' only if it is not there yet. This makes the cell
    # safe to run more than once.
    if 'missing' not in df_cat_test[col].cat.categories:
        df_cat_test[col] = df_cat_test[col].cat.add_categories('missing')
    df_cat_test[col] = df_cat_test[col].fillna('missing')

# Put the numeric and categorical parts back side by side.
df_all = pd.concat([df_num_test, df_cat_test], axis=1)

## 4. Dummy Variables

### Dummy Variables Encoding


One of the dummy columns for each variable can probably be dropped. **Be careful:** this line might exhaust your computer's RAM.

In [None]:
# One-hot encode the combined frame: get_dummies expands the categorical
# columns into indicator columns and passes the other columns through.
df_dummies = pd.get_dummies(data=df_all)

#### Opening the training columns. Use this instead of burning your RAM if you can.

In [None]:
# Load the saved column selection from variables.json. The rest of the
# notebook uses it as a mapping from a key to a list of column names.
with open('variables.json', 'r') as json_file:
    columns_thresh = json.loads(json_file.read())

#### The dummy columns of the test set differ from those of the training set, so the training columns cannot be reused blindly; each one has to be checked for presence in the test set.

In [None]:
# For every key in columns_thresh, keep only the listed columns that also
# exist among the test-set dummy columns, in their original listed order.
columns_dummies_test = list(df_dummies.columns)

# The dummy frame can be very wide, so test membership against a set (O(1)
# per lookup) rather than scanning the list once per candidate column.
_dummies_test_lookup = set(columns_dummies_test)

columns_test = {
    k: [var for var in v if var in _dummies_test_lookup]
    for k, v in columns_thresh.items()
}

#### Now the `dict_thresh` for the test set.

In [None]:
# Build one test-set frame per key by selecting that key's surviving columns
# from the dummy-encoded frame.
dict_thresh = {}
for thresh_key, kept_columns in columns_test.items():
    dict_thresh[thresh_key] = df_dummies[kept_columns]

#### Saving the Datasets

In [None]:
# Write each per-key test frame to its own CSV file (row index included).
for thresh_key in dict_thresh:
    dict_thresh[thresh_key].to_csv(f'test_thresh_{thresh_key}.csv')

#### Reopening them

In [None]:
# Reload the per-key test frames from disk, one file per key of
# columns_thresh; the first CSV column is restored as the row index.
dict_thresh_ext = {
    thresh_key: pd.read_csv(f'test_thresh_{thresh_key}.csv', index_col=0)
    for thresh_key in columns_thresh
}

## 5. Training and Testing

#### Training Set 

In [None]:
# Load the training frame that corresponds to each reloaded test frame.
# Each frame is stored under its key: assigning the read_csv result to
# dict_thresh_train itself would rebind the name to a single DataFrame and
# break every later dict_thresh_train[k] lookup.
dict_thresh_train = {}
for k in dict_thresh_ext:
    dict_thresh_train[k] = pd.read_csv(f'train_thresh_{k}.csv',
                                       index_col=0)

#### Training Targets

In [None]:
# Read the three label files (no header row in the files) into single-column
# frames, keyed by target name.
train_targets = {
    'Churn': pd.read_csv('orange_small_train_churn.labels',
                         names=['churn'], header=None),
    'Upselling': pd.read_csv('orange_small_train_upselling.labels',
                             names=['upselling'], header=None),
    'Appetency': pd.read_csv('orange_small_train_appetency.labels',
                             names=['appetency'], header=None),
}

In [None]:
# Convert every label frame to the categorical dtype, updating the dict in
# place. Iterate over a snapshot of the keys while the values are replaced.
for target_name in list(train_targets):
    train_targets[target_name] = train_targets[target_name].astype('category')

In [2]:
# Hyper-parameters for each target's gradient-boosting model, kept as plain
# data so the three configurations are easy to compare side by side.
_gb_params = {
    'Churn': dict(n_estimators=500,
                  learning_rate=0.05,
                  min_samples_split=150,
                  min_samples_leaf=20,
                  max_features=35,
                  subsample=0.75),
    'Upselling': dict(n_estimators=300,
                      learning_rate=0.1,
                      min_samples_split=200,
                      min_samples_leaf=45,
                      max_features=38,
                      subsample=1),
    'Appetency': dict(n_estimators=300,
                      learning_rate=0.05,
                      min_samples_split=100,
                      min_samples_leaf=55,
                      max_features=38,
                      subsample=1),
}

# One unfitted classifier per target; all share the same random seed.
clf_gb_ult = {
    target_name: GradientBoostingClassifier(random_state=42, **params)
    for target_name, params in _gb_params.items()
}

#### Matching columns on both the training and test sets.

In [None]:
# Drop, in place, every training column that has no counterpart in the
# matching test frame.
for thresh_key in dict_thresh_train:
    test_columns = dict_thresh_ext[thresh_key].columns.tolist()
    train_frame = dict_thresh_train[thresh_key]
    train_only = [name for name in train_frame.columns.tolist()
                  if name not in test_columns]
    for name in train_only:
        del train_frame[name]

#### Fitting and Predicting

In [None]:
# Fit each target's classifier on its training frame and predict on the
# matching test frame.
predictions = {}
for k, clf in clf_gb_ult.items():
    X_train = dict_thresh_train[k]

    # Select the test columns by the training column labels so both frames
    # expose the same features in the same order. A training column that is
    # missing from the test frame raises KeyError here instead of producing
    # misaligned predictions.
    X_test = dict_thresh_ext[k][X_train.columns.tolist()]

    # The labels were read as a single-column DataFrame; flatten them to the
    # 1-D shape (n_samples,) that scikit-learn expects for y, which avoids
    # the column-vector conversion warning.
    y_train = np.ravel(train_targets[k])

    clf.fit(X_train, y_train)
    predictions[k] = clf.predict(X_test)

#### Saving 

In [None]:
# Write each target's predictions to its own CSV, one value per row, with no
# header line and no index column.
for target_name, predicted in predictions.items():
    out_path = f'predictions_test_final_{target_name}.csv'
    df_pred_final = pd.DataFrame(predicted)
    df_pred_final.to_csv(out_path, index=False, header=None)