Install libraries

In [None]:
!pip install catboost

In [None]:
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, log_loss

Read the data

In [None]:
from google.colab import files
# Ask the user to upload the data files; `uploaded` is later indexed by file name
# (e.g. uploaded['Test.csv']) to obtain each file's raw bytes.
uploaded = files.upload()


In [None]:
import io

# Each entry of `uploaded` holds a file's raw bytes, so wrap them in an in-memory
# buffer that pandas can read like a file.
# NOTE(review): every later cell uses `df_train`, which was never loaded in this
# notebook (NameError on a fresh kernel). This assumes the training file was
# uploaded under the name 'Train.csv' — confirm the actual file name.
df_train = pd.read_csv(io.BytesIO(uploaded['Train.csv']))
df_test = pd.read_csv(io.BytesIO(uploaded['Test.csv']))
df_test

In [None]:
# Quick sanity check: (number of rows, number of columns) of the training frame.
df_train.shape

Data Preprocessing

In [None]:
# Derive the numeric features used by the models:
#   join_date -> parsed from day/month/year strings into datetimes
#   join_days -> number of days between the join date and 1 August 2020
#   age       -> 2020 minus the year of birth
# `assign` evaluates the callables in order, so `join_days` sees the parsed dates.
df_train = df_train.assign(
    join_date=lambda frame: pd.to_datetime(frame['join_date'], format="%d/%m/%Y"),
    join_days=lambda frame: (
        pd.to_datetime('1/8/2020', format="%d/%m/%Y") - frame['join_date']
    ).dt.days,
    age=lambda frame: 2020 - frame['birth_year'],
)
df_train.head()

Normalisation of join_days and age, encoding of sex, and removal of unused columns

In [None]:
# Min-max scale the two numeric features. The scaler is fitted on the training
# data only; the fitted `scaler` and `cols_to_norm` are reused for the test data.
scaler = MinMaxScaler()
cols_to_norm = ['join_days','age']
df_train[cols_to_norm] = scaler.fit_transform(df_train[cols_to_norm])
# Encode sex as F -> 0, M -> 1 (any other value is left as is).
# The result is assigned back to the column: calling
# `df_train['sex'].replace(..., inplace=True)` modifies a temporary Series under
# pandas Copy-on-Write and would leave `df_train` unchanged.
# `infer_objects()` restores a numeric dtype when replace() keeps the column as object.
df_train['sex'] = df_train['sex'].replace({'F': 0, 'M': 1}).infer_objects()
# Drop the identifier and the raw columns that were replaced by derived features.
df_train_scaled = df_train.drop(['ID', 'join_date', 'birth_year'], axis=1)
df_train_scaled

Test Data Preprocessing

In [None]:
# Apply the same feature derivation to the test data as to the training data:
# parse the join date, compute days since joining (relative to 1 August 2020)
# and the age in 2020.
df_test['join_date'] = pd.to_datetime(df_test['join_date'], format="%d/%m/%Y")
df_test['join_days'] = (pd.to_datetime('1/8/2020', format="%d/%m/%Y")- df_test['join_date']).dt.days
df_test['age'] = 2020 - df_test['birth_year']
# Reuse the scaler fitted on the training data (transform only, no re-fit).
df_test[cols_to_norm] = scaler.transform(df_test[cols_to_norm])
# Encode sex as F -> 0, M -> 1 (any other value is left as is).
# The result is assigned back to the column: calling
# `df_test['sex'].replace(..., inplace=True)` modifies a temporary Series under
# pandas Copy-on-Write and would leave `df_test` unchanged.
# `infer_objects()` restores a numeric dtype when replace() keeps the column as object.
df_test['sex'] = df_test['sex'].replace({'F': 0, 'M': 1}).infer_objects()
# `df_test` itself keeps the 'ID' column, which the submission cell reads later.
df_test_scaled = df_test.drop(['ID', 'join_date', 'birth_year'], axis=1)
df_test_scaled

Fill missing values (categorical features are passed to CatBoost directly via cat_features, so no one-hot encoding is performed)

In [None]:
# Display the preprocessed training frame for inspection.
df_train_scaled

In [None]:
# Replace missing `join_days` values with 0 in both frames; all other columns are
# left untouched (presumably the gaps come from rows without a join date — verify).
df_train_scaled = df_train_scaled.fillna({"join_days": 0})
df_test_scaled = df_test_scaled.fillna({"join_days": 0})
df_test_scaled

Function to predict all products

In [None]:
def predict_all_products(train_data, test_data, products_list, random_state=42):
  """Train one CatBoost classifier per product and score `test_data` with it.

  For every product in `products_list`, the product's column is the target and
  all remaining columns of `train_data` (including the other product columns)
  are the features. 75% of the training rows are used for fitting and the
  remaining 25% as CatBoost's evaluation set.

  Args:
    train_data: DataFrame with the feature columns and one column per product.
    test_data: DataFrame with the same feature columns; a product's own column
      is excluded from the features if present.
    products_list: names of the product columns to model.
    random_state: seed for the train/validation split so that repeated runs
      give the same split (the same split is used for every product).

  Returns:
    A tuple `(models, predicted_data, predicted_proba)`:
      models: dict mapping product name -> fitted CatBoostClassifier.
      predicted_data: DataFrame of predicted classes, one column per product.
      predicted_proba: DataFrame with the probability of the second class
        (column 1 of `predict_proba`), one column per product.
    Both DataFrames are indexed like `test_data`.
  """
  cat_features = ['marital_status', 'branch_code', 'occupation_code', 'occupation_category_code']

  # Index the outputs like the test frame so callers can look predictions up
  # with the test frame's own row labels.
  predicted_data = pd.DataFrame(index=test_data.index, columns=products_list)
  predicted_proba = pd.DataFrame(index=test_data.index)
  models = {}

  for product in products_list:
    X = train_data.loc[:, train_data.columns != product]
    y = train_data.loc[:, product]
    X_eval = test_data.loc[:, test_data.columns != product]
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.75, random_state=random_state)

    model = CatBoostClassifier(eval_metric="Logloss")
    model.fit(X_train, y_train, cat_features=cat_features,
              eval_set=(X_validation, y_validation), plot=True)
    models[product] = model

    predicted_data[product] = model.predict(X_eval)
    # Keep only the probability of the second class (column 1).
    predicted_proba[product] = model.predict_proba(X_eval)[:, 1]

  return models, predicted_data, predicted_proba
    

In [None]:
# Product codes; each one is a column of the train/test frames and is modelled
# as a separate target.
products = [
    'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4',
    'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X',
    'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3',
]
# models: product -> fitted classifier; prediction_data: predicted classes;
# prediction_proba: predicted probability per product (one column each).
models, prediction_data, prediction_proba = predict_all_products(df_train_scaled, df_test_scaled, products)

Function to predict a single product


In [None]:
# global variables
# Filled by predict_single_product(): one column per product that was processed.
predicted_prod_data = pd.DataFrame()
predicted_prod_proba = pd.DataFrame()
def predict_single_product(all_data, train_data, test_data, product):
  """Train a CatBoost classifier for one product with a weighted evaluation set.

  The model is fitted on `train_data` and evaluated during training on
  `all_data`, whose rows carry a per-row weight in the 'Weight' column. The
  predictions for `test_data` are stored in the module-level frames
  `predicted_prod_data` (predicted class) and `predicted_prod_proba`
  (probability of the second class) under the column `product`.

  Args:
    all_data: evaluation rows; must contain the feature columns, the product
      columns and a 'Weight' column. The caller's frame is not modified.
    train_data: training rows (features plus product columns).
    test_data: rows to predict; the `product` column is excluded from the
      features.
    product: name of the product column used as the target.

  Returns:
    None. Results are written to `predicted_prod_data` / `predicted_prod_proba`.
  """
  numeric_cols = ['sex', 'P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ',
                  'PYUQ', 'LJR9', 'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN',
                  'JZ9D', 'J9JW', 'GHYX', 'ECY3', 'Weight']
  cat_features = ['marital_status', 'branch_code', 'occupation_code', 'occupation_category_code']
  weights = 'Weight'

  # Work on a copy: the dtype conversion and the removal of the weight column
  # below must not leak into the frame owned by the caller.
  all_data = all_data.copy()
  all_data[numeric_cols] = all_data[numeric_cols].apply(pd.to_numeric)

  # Split the weights off so they are not used as a feature.
  weight = all_data.loc[:, weights].tolist()
  del all_data[weights]

  # Training data
  X_train = train_data.loc[:, train_data.columns != product]
  y_train = train_data.loc[:, product]
  # Test data
  X_test = test_data.loc[:, test_data.columns != product]

  if len(all_data) > 0:
    X_eval = all_data.loc[:, all_data.columns != product]
    y_eval = all_data.loc[:, product]
    eval_data = Pool(data=X_eval, label=y_eval, cat_features=cat_features, weight=weight)
  else:
    # No evaluation rows were generated for this product: train without an
    # evaluation set instead of building a Pool from an empty frame.
    eval_data = None

  model = CatBoostClassifier(iterations=1600, eval_metric="Logloss")

  # Fit model
  model.fit(X_train, y_train, eval_set=eval_data, cat_features=cat_features, plot=True)

  predicted_prod_data[product] = model.predict(X_test)
  # Keep only the probability of the second class (column 1).
  predicted_prod_proba[product] = model.predict_proba(X_test)[:, 1]
  return
    

In [None]:
# For each product, calculating the probabilities
df_test_expanded=pd.DataFrame()
# Columns of the derived evaluation frame: the test columns plus a row weight.
expanded_columns = [*df_test_scaled.columns, 'Weight']
for product in products:
  # Test rows in which this product is already held (column value == 1).
  df_product = df_test_scaled.loc[df_test_scaled[product] == 1]
  remaining_products = [other for other in products if other != product]

  # For every such row and every other product the row does not hold yet, create
  # a copy of the row with that other product switched on. The copy is weighted
  # by the probability previously predicted for that row and product.
  product_new_data_list = []
  for index, row in df_product.iterrows():
    for indiv_product in remaining_products:
      # Creation of new row only when the product value is 0
      if row[indiv_product] != 1:
        new_row = row.to_dict()
        new_row[indiv_product] = 1
        new_row['Weight'] = prediction_proba.loc[index, indiv_product]
        # Keep the record as a dict so values are matched to columns by name,
        # not by position.
        product_new_data_list.append(new_row)
  df_product_new_data = pd.DataFrame(product_new_data_list, columns=expanded_columns)

  predict_single_product(df_product_new_data, df_train_scaled, df_test_scaled, product)
  



In [None]:
# Count the number of misclassification
# wrong_pred_count=0
# for product in products:
#   for row in range(len(predicted_prod_data.index)):
#     if predicted_prod_data[product][row] != df_test_scaled[product][row] and df_test_scaled[product][row] == 1:
#       print(product,row)
#       wrong_pred_count += 1      
# wrong_pred_count

Normalizing the probabilities

In [None]:
# Rescale every row so that its per-product probabilities add up to 1:
# each value is divided by the sum of its own row.
prediction_proba = prediction_proba.divide(prediction_proba.sum(axis="columns"), axis="index")
prediction_proba

Submission File Generation

In [None]:
# Build the submission: one row per (customer, product) pair, ordered product by
# product. The key is "<ID> X <product>". The label is 1 when the test data
# already shows the product as held, otherwise the predicted probability.
data_submission = []
# Work on plain arrays (positional access) instead of per-cell label lookups:
# this avoids chained indexing inside a nested loop and does not depend on the
# frames having a default 0..n-1 index.
customer_ids = df_test['ID'].to_numpy()
for product in products:
  already_held = df_test[product].to_numpy() == 1
  probabilities = prediction_proba[product].to_numpy()
  labels = np.where(already_held, 1.0, probabilities)
  data_submission.extend(
      [customer_id + ' X ' + product, label]
      for customer_id, label in zip(customer_ids, labels)
  )
df_submission = pd.DataFrame(data_submission, columns=['ID X PCODE', 'Label'])


In [None]:
# Write the submission to CSV without the DataFrame index column, then hand the
# file to Colab's download helper.
df_submission.to_csv('submission.csv', index=False)
from google.colab import files
files.download("submission.csv")