# <font color = 'teal'><center> Feature Engineering & Data preparation

In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy.sparse import hstack
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

## Handling categorical features

In EDA we had found some pairs of categorical feature that have high amount of correlation among them, so it will be better if I  drop them and threshold selected is 0.9

In [3]:
#Categorical column names to be dropped
cat_drop = ['cat2','cat3','cat4','cat5','cat6','cat7','cat8','cat9','cat50','cat71','cat86',
           'cat95','cat96','cat98','cat104']
train_df.drop(cat_drop,axis=1,inplace=True)
test_df.drop(cat_drop,axis=1,inplace=True)

>There are multiple ways to encode categorical feature. I have also gone through some of the [blogs](https://medium.com/kaggle-blog/allstate-claims-severity-competition-2nd-place-winners-interview-alexey-noskov-f4e4ce18fcfc) related to this competition discussion about ways used to encode features. I am not going to use label encoding or lexical encoding or one hot encoding. <br><br>
<font color='chocolate'>My strategy is to combine all these 101 features in to one new column. Which means that each of the row in that new column will contain 101 words long text data(here I am considering single letter as a word). <br>
   
**So, below cell shows how each row will look like in the new column after all 101 columns values are joined using space between them, for any one data point.**

In [4]:
word = list(train_df.loc[0,'cat1':'cat116'])
print(len(word))
print(word)

101
['A', 'A', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'A', 'D', 'B', 'B', 'D', 'D', 'B', 'D', 'C', 'B', 'B', 'A', 'A', 'A', 'A', 'A', 'D', 'B', 'A', 'T', 'B', 'G', 'A', 'A', 'E', 'G', 'J', 'G', 'BU', 'BC', 'C', 'AS', 'S', 'A', 'O', 'LB']


Now, after joining above list with space will give me a document(in terms of NLP) of lenght 101 words. Shown in below cells. 
This will be done with all the rows, and resultant will be a feature which can be considered as text column by me. 

In [5]:
print(' '.join(word))

A A B A A A A A A A A A A A B A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A B A D B B D D B D C B B A A A A A D B A T B G A A E G J G BU BC C AS S A O LB


I will be discarding all categorical columns as now all those informations are stored in the form of a new column. <br>
Now the question is how to vectorize this text feature that we are going to create in upcoming cells?
* I can write down my own custom code for it or use sklearn's Countvectorizer with prediefined vocabulary list. 
* While applying countvectorizer, I can extract unigram, bigram and trigram features from it.
* Also I can try tfidf featurization.

In [6]:
def new_feature(row, df):
    word_list = np.array(df.loc[row,'cat1':'cat116'])
    text = ' '.join(word_list)
    return text

In [7]:
train_df['text'] = None
test_df['text'] = None
for i in tqdm(range(train_df.shape[0])):
    train_df.loc[i,'text'] = new_feature(i,train_df)

for i in tqdm(range(test_df.shape[0])):
    test_df.loc[i,'text'] = new_feature(i, test_df)

100%|█████████████████████████████████████████████████████████████████████████| 188318/188318 [22:49<00:00, 137.46it/s]
100%|█████████████████████████████████████████████████████████████████████████| 125546/125546 [09:07<00:00, 229.51it/s]


In [8]:
#Saving this text columns:
with open('train_text.pkl','wb') as f:
    pickle.dump(train_df['text'],f)
    
with open('test_text.pkl','wb') as f:
    pickle.dump(test_df['text'],f)

In [9]:
#Dropping rest of the categorical columns
cat = [col for col in train_df.columns if 'cat' in col]
train_df.drop(cat,axis=1,inplace=True)
test_df.drop(cat,axis=1,inplace=True)
print(train_df.columns)

Index(['id', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
       'loss', 'text'],
      dtype='object')


In [10]:
#Finding unique words in whole new column: https://stackoverflow.com/a/38558245
vocab = train_df['text'].str.split(' ', expand= True).stack().unique()
print(len(vocab))
print(vocab)

338
['A' 'B' 'D' 'C' 'T' 'G' 'E' 'J' 'BU' 'BC' 'AS' 'S' 'O' 'LB' 'L' 'F' 'I'
 'K' 'BI' 'CQ' 'AV' 'BM' 'DP' 'H' 'AB' 'DK' 'AF' 'GK' 'CS' 'N' 'AE' 'DJ'
 'P' 'Y' 'CK' 'EB' 'AH' 'LO' 'Q' 'M' 'AX' 'IE' 'DW' 'U' 'LY' 'AM' 'GS'
 'AI' 'HK' 'EG' 'AK' 'DC' 'MP' 'DS' 'CL' 'LE' 'HQ' 'BS' 'R' 'HJ' 'AP' 'GC'
 'BY' 'AD' 'BT' 'HX' 'HL' 'AL' 'AN' 'HG' 'CO' 'MD' 'LF' 'LM' 'CM' 'CB'
 'EL' 'AW' 'AJ' 'AY' 'AT' 'KQ' 'W' 'EE' 'AR' 'AC' 'HN' 'LQ' 'AU' 'DX' 'AQ'
 'KW' 'CD' 'IT' 'LN' 'CI' 'CW' 'LC' 'DT' 'GX' 'GE' 'BG' 'CP' 'BO' 'HB'
 'GI' 'GM' 'CR' 'JR' 'BD' 'HA' 'V' 'BF' 'BK' 'BA' 'AO' 'LJ' 'IH' 'AG' 'HV'
 'DM' 'GU' 'HM' 'CY' 'IC' 'EF' 'BJ' 'KD' 'KI' 'DL' 'DA' 'DN' 'X' 'MG' 'LL'
 'KN' 'BQ' 'AA' 'LH' 'BP' 'DF' 'EY' 'LW' 'KA' 'EK' 'EO' 'DH' 'CG' 'HC'
 'DI' 'BN' 'FB' 'IG' 'FR' 'CF' 'BL' 'EC' 'KR' 'HI' 'BH' 'IU' 'MC' 'JW'
 'FH' 'IF' 'CH' 'KL' 'LX' 'EM' 'IL' 'KB' 'IQ' 'JX' 'GN' 'FD' 'ME' 'KC'
 'FT' 'CT' 'GL' 'ES' 'JL' 'BX' 'II' 'HP' 'ED' 'CU' 'EN' 'FG' 'MJ' 'KE'
 'DD' 'EI' 'FX' 'CJ' 'EA' 'KP' 'EP' 'FC' 'GB' 'JU' 'L

## 1.1. Vectorizing categorical features using CountVectorizer
* Unigram Bag-of-word(Bow)
* Bigram BOW
* Trigram BOW

In [11]:
# Creating unigram features
vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b",lowercase=False)
cat_vect_train = vect.fit_transform(train_df['text'])
cat_vect_test = vect.transform(test_df['text'])
print('Some feature names: ', vect.get_feature_names()[:100])
print('Length of unigram features: ',len(vect.get_feature_names()))

Some feature names:  ['A', 'AA', 'AB', 'AC', 'AD', 'AE', 'AF', 'AG', 'AH', 'AI', 'AJ', 'AK', 'AL', 'AM', 'AN', 'AO', 'AP', 'AQ', 'AR', 'AS', 'AT', 'AU', 'AV', 'AW', 'AX', 'AY', 'B', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BK', 'BL', 'BM', 'BN', 'BO', 'BP', 'BQ', 'BR', 'BS', 'BT', 'BU', 'BV', 'BW', 'BX', 'BY', 'C', 'CA', 'CB', 'CC', 'CD', 'CE', 'CF', 'CG', 'CH', 'CI', 'CJ', 'CK', 'CL', 'CM', 'CN', 'CO', 'CP', 'CQ', 'CR', 'CS', 'CT', 'CU', 'CV', 'CW', 'CX', 'CY', 'D', 'DA', 'DB', 'DC', 'DD', 'DE', 'DF', 'DG', 'DH', 'DI', 'DJ', 'DK', 'DL', 'DM', 'DN', 'DO', 'DP', 'DQ', 'DR', 'DS', 'DT', 'DU']
Length of unigram features:  338


In [12]:
with open('unigram_train.pkl','wb') as f:
    pickle.dump(cat_vect_train,f)

with open('unigram_test.pkl','wb') as f:
    pickle.dump(cat_vect_test,f)

In [15]:
# Creating bigram features
vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b",lowercase=False, ngram_range=(2,2))
cat_vect_train = vect.fit_transform(train_df['text'])
cat_vect_test = vect.transform(test_df['text'])
print('Some feature names: ', vect.get_feature_names()[:100])
print('Length of bigram features: ',len(vect.get_feature_names()))

Some feature names:  ['A A', 'A AA', 'A AB', 'A AC', 'A AD', 'A AE', 'A AF', 'A AG', 'A AH', 'A AI', 'A AJ', 'A AK', 'A AL', 'A AM', 'A AN', 'A AO', 'A AP', 'A AQ', 'A AR', 'A AS', 'A AT', 'A AU', 'A AV', 'A AW', 'A AX', 'A AY', 'A B', 'A BA', 'A BB', 'A BC', 'A BD', 'A BF', 'A BG', 'A BH', 'A BI', 'A BJ', 'A BK', 'A BM', 'A BN', 'A BO', 'A BT', 'A C', 'A CH', 'A CJ', 'A CL', 'A CR', 'A D', 'A E', 'A EB', 'A F', 'A FE', 'A FU', 'A G', 'A GA', 'A GB', 'A GK', 'A H', 'A I', 'A IC', 'A J', 'A JL', 'A JQ', 'A JR', 'A JW', 'A JX', 'A JY', 'A K', 'A KA', 'A KB', 'A KC', 'A KD', 'A KW', 'A L', 'A LF', 'A LN', 'A LO', 'A M', 'A MC', 'A MD', 'A ME', 'A MO', 'A MP', 'A N', 'A O', 'A P', 'A Q', 'A R', 'A S', 'A T', 'A U', 'A V', 'A W', 'A X', 'A Y', 'AA A', 'AA AD', 'AA AE', 'AA AF', 'AA AG', 'AA AH']
Length of bigram features:  4361


In [16]:
with open('bigram_train.pkl','wb') as f:
    pickle.dump(cat_vect_train,f)

with open('bigram_test.pkl','wb') as f:
    pickle.dump(cat_vect_test,f)

In [34]:
# Creating trigram features
vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b",lowercase=False, ngram_range=(3,3))
cat_vect_train = vect.fit_transform(train_df['text'])
cat_vect_test = vect.transform(test_df['text'])
print('Some feature names: ', vect.get_feature_names()[:100])
print('Length of trigram features: ',len(vect.get_feature_names()))

Some feature names:  ['A A A', 'A A AA', 'A A AB', 'A A AD', 'A A AE', 'A A AF', 'A A AG', 'A A AH', 'A A AI', 'A A AJ', 'A A AK', 'A A AL', 'A A AM', 'A A AN', 'A A AO', 'A A AP', 'A A AR', 'A A AS', 'A A AT', 'A A AU', 'A A AV', 'A A AW', 'A A AX', 'A A AY', 'A A B', 'A A BA', 'A A BC', 'A A BD', 'A A BF', 'A A BG', 'A A BJ', 'A A BK', 'A A BM', 'A A BN', 'A A BO', 'A A C', 'A A D', 'A A E', 'A A F', 'A A FU', 'A A G', 'A A GA', 'A A GB', 'A A GK', 'A A H', 'A A I', 'A A J', 'A A JL', 'A A JQ', 'A A JR', 'A A JW', 'A A JX', 'A A JY', 'A A K', 'A A KA', 'A A KB', 'A A KD', 'A A L', 'A A LF', 'A A LN', 'A A LO', 'A A M', 'A A MC', 'A A MD', 'A A MO', 'A A MP', 'A A N', 'A A O', 'A A P', 'A A Q', 'A A R', 'A A S', 'A A T', 'A A U', 'A A V', 'A A X', 'A A Y', 'A AA A', 'A AA AD', 'A AA AE', 'A AA AF', 'A AA AG', 'A AA AH', 'A AA AJ', 'A AA AK', 'A AA AM', 'A AA AN', 'A AA AO', 'A AA AS', 'A AA AT', 'A AA AU', 'A AA AV', 'A AA AW', 'A AA AX', 'A AA AY', 'A AA BA', 'A AA BB', 'A AA BC', 'A

In [35]:
with open('trigram_train.pkl','wb') as f:
    pickle.dump(cat_vect_train,f)

with open('trigram_test.pkl','wb') as f:
    pickle.dump(cat_vect_test,f)

## 1.2. Vectorizing categorical features using TfidfVectorizer
* Unigram Tfidf feature
* Bigram Tfidf feature
* Trigram Tfidf feature

In [19]:
#Creating tfidf unigram
vect = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",lowercase=False)
cat_vect_train = vect.fit_transform(train_df['text'])
cat_vect_test = vect.transform(test_df['text'])

In [20]:
with open('unigram_train_tfidf.pkl','wb') as f:
    pickle.dump(cat_vect_train,f)

with open('unigram_test_tfidf.pkl','wb') as f:
    pickle.dump(cat_vect_test,f)

In [21]:
#Creating tfidf bigram
vect = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",lowercase=False, ngram_range=(2,2))
cat_vect_train = vect.fit_transform(train_df['text'])
cat_vect_test = vect.transform(test_df['text'])

In [22]:
with open('bigram_train_tfidf.pkl','wb') as f:
    pickle.dump(cat_vect_train,f)

with open('bigram_test_tfidf.pkl','wb') as f:
    pickle.dump(cat_vect_test,f)

In [23]:
#Creating tfidf trigram
vect = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",lowercase=False, ngram_range=(3,3))
cat_vect_train = vect.fit_transform(train_df['text'])
cat_vect_test = vect.transform(test_df['text'])

In [24]:
with open('trigram_train_tfidf.pkl','wb') as f:
    pickle.dump(cat_vect_train,f)

with open('trigram_test_tfidf.pkl','wb') as f:
    pickle.dump(cat_vect_test,f)

## 2. Handling continuous features
In EDA we found that certain pairs of continuous varible have higher association among them, so we will drop them and the threshold is 0.9

In [3]:
train_df.drop(['cont9','cont12'], axis=1, inplace=True)
test_df.drop(['cont9','cont12'], axis=1, inplace=True)

## 3. Stacking all features 
I will be stacking differernt sets categorical features with continuous features. And save them for future use in modelling part. My approach to stack features is:
1. Unigram BOW features + Continuous features
2. Bigram BOW features + Continuous features
3. Trigram BOW features + Continuous features
4. Bigram BOW features + Trigram features + Continuous features
5. Unigram TFIDF features + Continuous features
6. Bigram TFIDF features + Continuous features
7. Trigram TFIDF features + Continuous features
8. Bigram TFIDF features + Trigram features + Continuous features

Some of this stacking approach will lead to increase in number of dimensions. So to overcome this, I will be using some dimensionality reduction techniques.

I know that all these sets of stacked features seems overwhelming to work with in modelling part. What can be done, then?
Most of the time we observed that xgboost or some other ensemble method works better than some alone ML algorithms(I am saying this just from Kaggle's challenge perspective). So to decide which set of features is good to proceed further, I will be applying xgboost on all these sets of features and whichever set will give best results I will be proceeding with that.

### 3.1. Unigram BOW features + Continuous features

In [4]:
with open('unigram_train.pkl','rb') as f:
    unigram_train = pickle.load(f)
    
with open('unigram_test.pkl','rb') as f:
    unigram_test = pickle.load(f)

In [5]:
train_x = hstack((unigram_train,train_df.loc[:,'cont1':'cont14']))
test_x = hstack((unigram_test,test_df.loc[:,'cont1':'cont14']))
print('Shape of train_x', train_x.shape)
print('Shape of test_x', test_x.shape)

Shape of train_x (188318, 350)
Shape of test_x (125546, 350)


In [6]:
with open('unigram_bow_train.pkl','wb') as f:
    pickle.dump(train_x,f)
    
with open('unigram_bow_test.pkl','wb') as f:
    pickle.dump(test_x,f)

### 3.2. Bigram BOW features + Continuous features

In [7]:
with open('bigram_train.pkl','rb') as f:
    bigram_train = pickle.load(f)
    
with open('bigram_test.pkl','rb') as f:
    bigram_test = pickle.load(f)

In [8]:
train_x = hstack((bigram_train,train_df.loc[:,'cont1':'cont14']))
test_x = hstack((bigram_test,test_df.loc[:,'cont1':'cont14']))
print('Shape of train_x', train_x.shape)
print('Shape of test_x', test_x.shape)

Shape of train_x (188318, 4373)
Shape of test_x (125546, 4373)


In [9]:
with open('bigram_bow_train.pkl','wb') as f:
    pickle.dump(train_x,f)
    
with open('bigram_bow_test.pkl','wb') as f:
    pickle.dump(test_x,f)

### 3.3. Trigram BOW features + Continuous features

In [10]:
with open('trigram_train.pkl','rb') as f:
    trigram_train = pickle.load(f)
    
with open('trigram_test.pkl','rb') as f:
    trigram_test = pickle.load(f)

In [11]:
train_x = hstack((trigram_train,train_df.loc[:,'cont1':'cont14']))
test_x = hstack((trigram_test,test_df.loc[:,'cont1':'cont14']))
print('Shape of train_x', train_x.shape)
print('Shape of test_x', test_x.shape)

Shape of train_x (188318, 37458)
Shape of test_x (125546, 37458)


In [12]:
with open('trigram_bow_train.pkl','wb') as f:
    pickle.dump(train_x,f)
    
with open('trigram_bow_test.pkl','wb') as f:
    pickle.dump(test_x,f)

### 3.4. Bigram BOW features + Trigram features + Continuous features

In [13]:
train_x = hstack((bigram_train,trigram_train,train_df.loc[:,'cont1':'cont14']))
test_x = hstack((bigram_test,trigram_test,test_df.loc[:,'cont1':'cont14']))
print('Shape of train_x', train_x.shape)
print('Shape of test_x', test_x.shape)

Shape of train_x (188318, 41819)
Shape of test_x (125546, 41819)


In [14]:
with open('bi-trigram_bow_train.pkl','wb') as f:
    pickle.dump(train_x,f)
    
with open('bi-trigram_bow_test.pkl','wb') as f:
    pickle.dump(test_x,f)

### 3.5. Unigram TFIDF features + Continuous features

In [16]:
with open('unigram_train_tfidf.pkl','rb') as f:
    unigram_train = pickle.load(f)
    
with open('unigram_test_tfidf.pkl','rb') as f:
    unigram_test = pickle.load(f)

In [17]:
train_x = hstack((unigram_train,train_df.loc[:,'cont1':'cont14']))
test_x = hstack((unigram_test,test_df.loc[:,'cont1':'cont14']))
print('Shape of train_x', train_x.shape)
print('Shape of test_x', test_x.shape)

Shape of train_x (188318, 350)
Shape of test_x (125546, 350)


In [18]:
with open('unigram_tfidf_train.pkl','wb') as f:
    pickle.dump(train_x,f)
    
with open('unigram_tfidf_test.pkl','wb') as f:
    pickle.dump(test_x,f)

### 3.6. Bigram TFIDF features + Continuous features


In [19]:
with open('bigram_train_tfidf.pkl','rb') as f:
    bigram_train = pickle.load(f)
    
with open('bigram_test_tfidf.pkl','rb') as f:
    bigram_test = pickle.load(f)

In [20]:
train_x = hstack((bigram_train,train_df.loc[:,'cont1':'cont14']))
test_x = hstack((bigram_test,test_df.loc[:,'cont1':'cont14']))
print('Shape of train_x', train_x.shape)
print('Shape of test_x', test_x.shape)

Shape of train_x (188318, 4373)
Shape of test_x (125546, 4373)


In [21]:
with open('bigram_tfidf_train.pkl','wb') as f:
    pickle.dump(train_x,f)
    
with open('bigram_tfidf_test.pkl','wb') as f:
    pickle.dump(test_x,f)

### 3.7. Trigram TFIDF features + Continuous features


In [22]:
with open('trigram_train_tfidf.pkl','rb') as f:
    trigram_train = pickle.load(f)
    
with open('trigram_test_tfidf.pkl','rb') as f:
    trigram_test = pickle.load(f)

In [23]:
train_x = hstack((trigram_train,train_df.loc[:,'cont1':'cont14']))
test_x = hstack((trigram_test,test_df.loc[:,'cont1':'cont14']))
print('Shape of train_x', train_x.shape)
print('Shape of test_x', test_x.shape)

Shape of train_x (188318, 37458)
Shape of test_x (125546, 37458)


In [24]:
with open('trigram_tfidf_train.pkl','wb') as f:
    pickle.dump(train_x,f)
    
with open('trigram_tfidf_test.pkl','wb') as f:
    pickle.dump(test_x,f)

### 3.8. Bigram TFIDF features + Trigram features + Continuous features


In [25]:
train_x = hstack((bigram_train,trigram_train,train_df.loc[:,'cont1':'cont14']))
test_x = hstack((bigram_test,trigram_test,test_df.loc[:,'cont1':'cont14']))
print('Shape of train_x', train_x.shape)
print('Shape of test_x', test_x.shape)

Shape of train_x (188318, 41819)
Shape of test_x (125546, 41819)


In [26]:
with open('bi-trigram_tfidf_train.pkl','wb') as f:
    pickle.dump(train_x,f)
    
with open('bi-trigram_tfidf_test.pkl','wb') as f:
    pickle.dump(test_x,f)