In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import pandas as pd
import pickle
from itertools import chain
import warnings
warnings.simplefilter("ignore")

# plot
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.graph_objects as go

# text preprocessing
import re
import nltk
# uncomment if not already downloaded
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('brown')
#nltk.download('names')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('universal_tagset')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
#from normalise import normalise


# feature Engineering and feature Selection
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel,SelectKBest,chi2,mutual_info_classif

# ML model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.preprocessing import StandardScaler,LabelEncoder,OrdinalEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for input_dir, _, input_files in os.walk('/kaggle/input'):
    for input_name in input_files:
        print(os.path.join(input_dir, input_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load both competition files and stack them so that every feature-engineering
# step below is applied identically to the train and test rows.
train = pd.read_csv("/kaggle/input/hackerearth-ml-solving-the-citizens-grievances/dataset/train.csv")
test = pd.read_csv("/kaggle/input/hackerearth-ml-solving-the-citizens-grievances/dataset/test.csv")
# The test file has no target; -1 is a placeholder so both frames share the same columns.
test['importance'] = -1
# 'label' records which file a row came from; it is used later to split the frame again.
train['label'] = 'train'
test['label'] = 'test'
# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it the
# row labels of train and test overlap, which makes label-based alignment ambiguous.
combined = pd.concat([train, test], axis=0, ignore_index=True)
combined.shape

Feature Engineering :
Part-1

1) Intuition says that a severe crime translates to multiple violations. How do we measure multiple violations? By counting the total number of issues a case is registered under. This is done by the combine_and_count_issues function


2) Again, we seek to establish whether the number of respondents is related to the severity of a grievance.
H0 : There is no relationship between the number of respondents and the severity of a grievance

H1 : There is a relationship between the number of respondents and the severity of a grievance
<br>
encode_and_count_respondents is the function for it


3) Many Columns do not have much variance, and therefore do not contribute to the output . These were removed Functions : drop_cols , remove_constant_cols


4) Date columns were compared to calculate the number of days elapsed between milestones. Functions: generate_date_features


5) Item-Id is a unique id for a case. However, the numbers are in increasing order, so they could indicate the recency of a case. This column is converted to a number so that it carries an ordering, i.e. that a particular case came before the others. The other columns were one-hot encoded, as they can't be compared on an ordinal scale. Functions: col_encode


6) A simple function to impute the missing values with 0. This is because most of the features are dummy (indicator-like) categorical variables, so there is no need to impute with the mean or median. Function: fill_missing


7) A function to count the total number of articles or paragraphs referenced per case Function: calculate_articles_paragraph

In [None]:
# Step 1: Create Features from the Given Data
# In the second step, we will vectorize the data

def combine_and_count_issues(df):
    """Merge the per-case issue columns into one text column and count them.

    Every column whose name contains 'issue' is treated as an issue column.
    ``df`` is modified in place: the issue columns are removed and two columns
    are added.

    * ``issues`` -- the non-null issue strings of the row joined with '. '.
      A row without any issue gets an empty string, so the column never
      contains NaN and needs no separate fill step.
    * ``total_issues`` -- the number of non-null issue columns in the row.

    Returns the same ``df`` object.
    """
    print('Combining and Counting Total Issues')
    issue_columns = [x for x in df.columns if 'issue' in x]
    issue_values = df[issue_columns]
    df['issues'] = issue_values.apply(
        lambda row: '. '.join([val for val in row if pd.notna(val)]), axis=1)
    # Vectorised non-null count instead of a Python-level sum per row.
    df['total_issues'] = issue_values.notna().sum(axis=1)
    df.drop(issue_columns, axis=1, inplace=True)
    return df

# This function encodes the respondents and counts the total respondents
def encode_and_count_respondents(df):
    """Replace respondent names by their numeric code and count respondents per row.

    The lookup table is built from ``df`` itself (not from a notebook-level
    global): each value of 'country.name' and of 'respondent.0' is mapped to
    the 'respondentOrderEng' value of the same row. When a key occurs more
    than once the last row wins, and 'respondent.0' keys take precedence over
    'country.name' keys.

    ``df`` is modified in place:

    * 'respondent.0' .. 'respondent.4' (those that exist) are replaced by the
      looked-up code; null entries are left untouched.
    * 'total_respondents' is the number of non-null values across all columns
      whose name contains 'respondent.'.
    * 'respondentOrderEng' is dropped.

    Raises KeyError when a non-null respondent value is not in the lookup table.
    Returns the same ``df`` object.
    """
    print('Encoding and Counting Respondents')
    code_by_name = dict(zip(df['country.name'], df['respondentOrderEng']))
    code_by_name.update(zip(df['respondent.0'], df['respondentOrderEng']))

    encoded_cols = ['respondent.%d' % i for i in range(5)]
    for col in encoded_cols:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: code_by_name[x] if pd.notnull(x) else x)

    respondent_cols = [col for col in df.columns if 'respondent.' in col]
    df['total_respondents'] = df[respondent_cols].notna().sum(axis=1)
    df.drop('respondentOrderEng', axis=1, inplace=True)
    return df

# Remove cols not important for Modelling
def drop_cols(df):
    """Drop a fixed set of party/country/document metadata columns in place.

    Raises KeyError if any of the listed columns is missing.
    Returns the same ``df`` object.
    """
    unused_columns = (
        'parties.0', 'parties.2', 'country.alpha2', 'parties.1',
        'country.name', 'docname', 'appno', 'ecli', 'kpdate',
        'sharepointid', 'originatingbody_name',
    )
    df.drop(columns=list(unused_columns), inplace=True)
    return df

# This function removes features where there is no variance
def remove_constant_cols(df):
    """Delete, in place, every column that has exactly one distinct non-null value.

    The name of each removed column is printed. Because ``nunique`` ignores
    NaN, a column holding one value plus NaNs is removed, while an all-NaN
    column (zero distinct values) is kept.

    Returns the same ``df`` object.
    """
    print('Remove Constant Columns')
    constant_columns = [name for name in df.columns if df[name].nunique() == 1]
    for name in constant_columns:
        print(name, end=', ')
    df.drop(columns=constant_columns, inplace=True)
    return df

# This function generates new date features
def generate_date_features(df):
    """Add day-difference features between the three date columns, in place.

    Adds 'days_between_intro_decision', 'days_between_intro_judgement' and
    'days_between_decision_judgement' (later date minus earlier date, in
    whole days) and then drops 'decisiondate', 'introductiondate' and
    'judgementdate'.

    Returns the same ``df`` object.
    """
    print('generate date features')
    introduced = pd.to_datetime(df['introductiondate'])
    decided = pd.to_datetime(df['decisiondate'])
    judged = pd.to_datetime(df['judgementdate'])

    day_gaps = (
        ('days_between_intro_decision', decided - introduced),
        ('days_between_intro_judgement', judged - introduced),
        ('days_between_decision_judgement', judged - decided),
    )
    for feature_name, gap in day_gaps:
        df[feature_name] = gap.dt.days

    df.drop(columns=['decisiondate', 'introductiondate', 'judgementdate'], inplace=True)
    return df

# Encoding for few more feature columns
def col_encode(df):
    """Encode 'itemid', 'doctypebranch' and 'separateopinion', in place.

    * 'itemid' is reduced to characters 4..6 of the original string
      (presumably the numeric part that reflects case order -- TODO confirm).
    * 'doctypebranch' is one-hot encoded into 'doctypebranch_<value>' columns
      appended at the end of the frame; the original column is then dropped.
    * 'separateopinion' is label-encoded with sklearn's LabelEncoder.

    Returns the same ``df`` object.
    """
    print('One-hot encoding Relevcant Rows')
    le = LabelEncoder()
    df['itemid'] = df['itemid'].apply(lambda x: x[4:7])
    # get_dummies returns a new frame, so the indicator columns must be copied
    # back explicitly; otherwise the branch information is lost by the drop below.
    branch_dummies = pd.get_dummies(df['doctypebranch'], prefix='doctypebranch')
    for dummy_name in branch_dummies.columns:
        df[dummy_name] = branch_dummies[dummy_name]
    df.drop(['doctypebranch'], axis=1, inplace=True)
    df['separateopinion'] = le.fit_transform(df['separateopinion'])
    return df

# Fill Any missing values with 0 except for issues
def fill_missing(df):
    """Fill NaN with 0 and cast to int for every column except 'issues' and 'label'.

    ``df`` is modified in place and returned. The filled Series is assigned
    back to the frame explicitly: ``df[col].fillna(0, inplace=True)`` works on
    an intermediate object and is not guaranteed to update ``df`` (it is a
    no-op under pandas copy-on-write), which would make the integer cast fail
    on the remaining NaN values.
    """
    print('Replace NA values in Numerical Columns with 0')
    for col in df.columns:
        if col not in ['issues', 'label']:
            df[col] = df[col].fillna(0).astype('int')
    return df

def calculate_articles_paragraph(df):
    """Add per-row totals over the article and paragraph columns, in place.

    'total_articles' sums every column whose name contains 'article' but not
    '_article'; 'total_paragraphs' sums every column whose name contains
    'paragraphs'. Returns the same ``df`` object.
    """
    def row_totals(column_names):
        # Plain Python sum over the values of each row.
        return df[column_names].apply(lambda row: sum(row), axis=1)

    article_cols = [name for name in df.columns
                    if 'article' in name and '_article' not in name]
    df['total_articles'] = row_totals(article_cols)

    paragraph_cols = [name for name in df.columns if 'paragraphs' in name]
    df['total_paragraphs'] = row_totals(paragraph_cols)
    return df

In [None]:
# Run the feature-engineering steps in order; each step receives the frame
# returned by the previous one.
combined = (
    combined
    .pipe(combine_and_count_issues)
    .pipe(encode_and_count_respondents)
    .pipe(drop_cols)
    .pipe(remove_constant_cols)
    .pipe(generate_date_features)
    .pipe(col_encode)
    .pipe(fill_missing)
    .pipe(calculate_articles_paragraph)
)

Feature Engineering
Part 2 : Text Features

1) Lemmatize the 'issues' column

2) Generate hand-crafted features: count the number of times each POS (part of speech) occurred.
Count the number of times a sequence of digits occurred. This would indicate the number of articles referenced
3) Create a special Column for Criminal grievances


Once basic text pre-processing is done, generate features from the lemmatized text column using TF-IDF with n-grams between 1 and 3

In [None]:
# Text Processing

def tokenize(text):
    """Split ``text`` into a list of tokens with nltk's word tokenizer."""
    tokens = nltk.word_tokenize(text)
    return tokens

def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def words_lemmatizer(text, encoding="utf8"):
    """Lemmatize every token of ``text`` and return them joined by single spaces.

    Each token is tagged on its own via ``find_pos`` and lemmatized with
    WordNet using that tag. ``encoding`` is accepted but not used.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token, find_pos(token))
        for token in nltk.word_tokenize(text)
    )

def stemmer(text):
    """Porter-stem every token of ``text`` and return them joined by single spaces."""
    stem = nltk.PorterStemmer().stem
    return " ".join(map(stem, nltk.word_tokenize(text)))


# Function to find part of speech tag for a word
def find_pos(word):
    """Return the WordNet POS code for ``word``: 'a', 'r', 'v' or 'n'.

    Only the tag of the first token produced for ``word`` is used, and the
    decision is made on the lower-cased first letter of that tag:
    'j' (JJ, JJR, JJS) -> 'a', 'r' (RB, RBR, RBS) -> 'r',
    'v' (VB, VBD, VBG, VBN, VBP, VBZ) -> 'v', anything else -> 'n'.
    """
    wordnet_code_by_initial = {'j': 'a', 'r': 'r', 'v': 'v'}
    first_tag = nltk.pos_tag(nltk.word_tokenize(word))[0][1]
    return wordnet_code_by_initial.get(first_tag.lower()[0], 'n')

def remove_stopwords(text, lang='english'):
    """
    Drop stop words from ``text`` while keeping the negations 'not' and 'no'.

    :param text: text input
    :param lang: name of the nltk stop-word list to use
    :return: the remaining tokens joined by single spaces
    :rtype: str
    """
    kept_negations = ('not', 'no')
    blocked = {word for word in stopwords.words(lang) if word not in kept_negations}
    return " ".join(token for token in nltk.word_tokenize(text) if token not in blocked)

In [None]:
def do_preprocessing(df):
    """
    Clean the 'issues' text and create hand-crafted text features.

    ``df`` must contain a string column 'issues'. The frame is modified in
    place and returned. Added columns:

    * 'Issue_cleaned'        -- lower-cased text with punctuation/underscores
      replaced by spaces, digits removed and stop words removed.
    * 'Issues_cleaned_lemma' / 'Issues_cleaned_stem' -- lemmatized / stemmed
      versions of 'Issue_cleaned'.
    * 'words'                -- token list of the raw 'issues' text.
    * '*__cnt', 'text__len', 'sent__num' -- counts computed on the raw text
      or its tokens (see the inline comments).
    """
    # Loaded once; looking the list up again for every token is very slow.
    english_stopwords = set(stopwords.words('english'))

    # regex=True is required: pandas >= 2.0 treats the pattern literally by
    # default, which would leave all punctuation in place.
    df['Issue_cleaned'] = df['issues'].str.lower().\
                                str.replace(r'[^\w\s]|_', ' ', regex=True).\
                                apply(remove_numbers).\
                                apply(remove_stopwords)
    df['Issues_cleaned_lemma'] = df['Issue_cleaned'].apply(words_lemmatizer)
    df['Issues_cleaned_stem'] = df['Issue_cleaned'].apply(stemmer)

    # hand-crafted features
    df['words'] = df['issues'].apply(tokenize)
    df['text__len'] = df['words'].apply(len)
    df['sent__num'] = df['issues'].apply(lambda x: len(nltk.sent_tokenize(x)))
    # tokens containing at least one digit / one bracket character
    df['digit__cnt'] = df['words'].apply(lambda x: sum([re.search(r'\d', i) is not None for i in x]))
    df['bracket__cnt'] = df['words'].apply(lambda x: sum([re.search(r'\(|\)|\[|\]', i) is not None for i in x]))
    df['equal__cnt'] = df['issues'].apply(lambda x: len(re.findall('=|<-', x)))

    # Tag every text once and derive the four POS counts from the same tags.
    pos_tags = df['issues'].apply(lambda x: [tag for _token, tag in TextBlob(x).tags])
    df['verb__cnt'] = pos_tags.apply(lambda tags: sum([tag[0] == 'V' for tag in tags]))
    df['noun__cnt'] = pos_tags.apply(lambda tags: sum([tag[0] == 'N' for tag in tags]))
    df['adv__cnt'] = pos_tags.apply(lambda tags: sum([tag[0] == 'R' for tag in tags]))
    df['adj__cnt'] = pos_tags.apply(lambda tags: sum([tag[0] == 'J' for tag in tags]))

    # Case-sensitive match on the raw text.
    df['criminal__cnt'] = df['issues'].apply(lambda x: len(re.findall('criminal|crime', x)))
    df['nonStop__cnt'] = df['words'].apply(lambda x: len([i for i in x if i not in english_stopwords]))

    # Runs of the same character repeated three or more times.
    repeated_char = re.compile(r'([a-z])\1{2,}')
    repeated_digit = re.compile(r'([0-9])\1{2,}')
    repeated_punct = re.compile(r'([\W|_])\1{2,}')
    df['continuousChar__cnt'] = df['issues'].apply(lambda x: len(repeated_char.findall(x)))
    df['continuousDigit__cnt'] = df['issues'].apply(lambda x: len(repeated_digit.findall(x)))
    df['continuousPunct__cnt'] = df['issues'].apply(lambda x: len(repeated_punct.findall(x)))
    return df

Feature Selection :

To do feature selection, we have to split the dataset first. Because the output column has multiple classes, we use stratified sampling to generate the train and test sets.


Feature selection is done using mutual_info_classif (mutual information), as it was found to be empirically better than the chi-square test

In [None]:
# Keep only the rows that came from train.csv, drop the origin marker and
# add the text features.
combined_train = (
    combined
    .query('label == "train"')
    .drop(columns=['label'])
    .pipe(do_preprocessing)
)

In [None]:
# Train-Test Split in 80:20 Ratio, stratified on the target.
# random_state is fixed so that the split -- and therefore every score reported
# below -- is the same on each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    combined_train.drop('importance', axis=1),
    combined_train['importance'],
    test_size=0.2,
    stratify=combined_train['importance'],
    random_state=42,
)

In [None]:
# Vectorize the issue column and add the generated features to the X_train matrix
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3),  # token_pattern=r'b[^\d\W]+\b',
                       min_df=5, binary=True)
X_train_dtm = vect.fit_transform(X_train['Issues_cleaned_lemma'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on versions that predate it.
if hasattr(vect, 'get_feature_names_out'):
    tfidf_columns = vect.get_feature_names_out()
else:
    tfidf_columns = vect.get_feature_names()
df1 = pd.DataFrame(X_train_dtm.toarray(), columns=tfidf_columns)

In [None]:
# Merge X_train with features matrix
# The text columns are dropped on a derived frame instead of in place, so this
# cell can be re-run without a KeyError and X_train itself is left untouched.
# The index is reset so that the rows line up positionally with df1.
res = pd.concat(
    [
        X_train
        .drop(columns=['Issues_cleaned_lemma', 'issues', 'Issue_cleaned', 'Issues_cleaned_stem', 'words'])
        .reset_index(drop=True),
        df1,
    ],
    axis=1,
)

In [None]:
# Prepare X_test Matrix
X_test_dtm = vect.transform(X_test['Issues_cleaned_lemma'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on versions that predate it.
if hasattr(vect, 'get_feature_names_out'):
    tfidf_columns = vect.get_feature_names_out()
else:
    tfidf_columns = vect.get_feature_names()
df_test = pd.DataFrame(X_test_dtm.toarray(), columns=tfidf_columns)
# Drop the text columns on a derived frame (not in place) so the cell can be
# re-run; the index is reset so the rows line up positionally with df_test.
res_test = pd.concat(
    [
        X_test
        .drop(columns=['Issues_cleaned_lemma', 'issues', 'Issue_cleaned', 'Issues_cleaned_stem', 'words'])
        .reset_index(drop=True),
        df_test,
    ],
    axis=1,
)

In [None]:
# feature selection
def select_features(X_train, y_train, X_test, k, random_state=0):
    """Keep the ``k`` features with the highest mutual information with the target.

    The selector is fitted on ``(X_train, y_train)`` only and then applied to
    both ``X_train`` and ``X_test``.

    ``mutual_info_classif`` is randomised, so its scores change between calls
    unless it is seeded. This notebook calls ``select_features`` again with
    the same training data before predicting with an already-fitted model; a
    fixed ``random_state`` guarantees that every such call picks the same
    columns the model was trained on.

    Parameters
    ----------
    X_train, y_train : training features and target used to fit the selector.
    X_test : features to transform with the fitted selector.
    k : number of features to keep.
    random_state : seed passed to ``mutual_info_classif`` (default 0).

    Returns
    -------
    (X_train_fs, X_test_fs) : the two transformed feature matrices.
    """
    def seeded_mutual_info(X, y):
        return mutual_info_classif(X, y, random_state=random_state)

    fs = SelectKBest(score_func=seeded_mutual_info, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs


**Model Building : xgboost, LGBM, Catboost, Voting Classifier**

Prepare the competition test set (used for the submission files)

In [None]:
# Rows that came from test.csv, with the text features added.
combined_test = combined.query('label == "test"').pipe(do_preprocessing)
# Keep only model inputs plus the lemmatized text, which is vectorized in the next cell.
non_feature_columns = ['issues', 'label', 'Issue_cleaned', 'Issues_cleaned_stem', 'words', 'importance']
test = combined_test.drop(columns=non_feature_columns)

In [None]:
# TF-IDF features for the competition test set, using the vocabulary fitted on X_train.
X_test_dtm_s = vect.transform(test['Issues_cleaned_lemma'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on versions that predate it.
if hasattr(vect, 'get_feature_names_out'):
    tfidf_columns = vect.get_feature_names_out()
else:
    tfidf_columns = vect.get_feature_names()
df2 = pd.DataFrame(X_test_dtm_s.toarray(), columns=tfidf_columns)
# Build the final matrix from a derived frame instead of deleting the text
# column in place, so the cell can be re-run. The index is reset so the rows
# line up positionally with df2.
res2 = pd.concat(
    [test.drop(columns=['Issues_cleaned_lemma']).reset_index(drop=True), df2],
    axis=1,
)

XGBOOST

In [None]:
# With 200 features
X_train_fs, X_test_fs = select_features(res, Y_train, res_test, 200)
XG200 = XGBClassifier()
# Out-of-fold predictions from 5-fold cross-validation on the training split.
xg_train = cross_val_predict(XG200, X_train_fs, Y_train, cv=5,
                             n_jobs=-1, method="predict")
print("cv score: Train ", accuracy_score(Y_train, xg_train) * 100)

# Refit on the whole training split, then score the 20% hold-out split.
XG200.fit(X_train_fs, Y_train)
xg_test = XG200.predict(X_test_fs)

# This is the hold-out accuracy, so it is labelled "Test" like the other model cells.
print("cv score: Test ", accuracy_score(Y_test, xg_test) * 100)

In [None]:
# With 150 features
X_train_fs, X_test_fs = select_features(res, Y_train, res_test, 150)
XG = XGBClassifier()

# Out-of-fold predictions from 5-fold cross-validation on the training split
xg_train = cross_val_predict(
    XG, X_train_fs, Y_train, cv=5, n_jobs=-1, method="predict"
)
xg_train_accuracy = accuracy_score(xg_train, Y_train) * 100
print("cv score: Train ", xg_train_accuracy)

# Refit on the whole training split and score the hold-out split
XG.fit(X_train_fs, Y_train)
xg_test = XG.predict(X_test_fs)
xg_test_accuracy = accuracy_score(xg_test, Y_test) * 100
print("cv score: Test ", xg_test_accuracy)

In [None]:
# Re-fit the 150-feature selector on the training split and apply it to the
# competition test matrix (res2), then predict with the already-fitted XG model.
X_train_fs, X_test_fs = select_features(res, Y_train, res2, 150)
xg_test_set = XG.predict(X_test_fs)

# 'appno' was dropped during feature engineering, so the raw test file is read again to get it.
test = pd.read_csv("/kaggle/input/hackerearth-ml-solving-the-citizens-grievances/dataset/test.csv")
sub = pd.DataFrame({"appno": test.appno, "importance": xg_test_set})
sub.to_csv("result_tunedxg150.csv", index=False)

In [None]:
# Re-fit the 200-feature selector on the training split and apply it to the
# competition test matrix (res2), then predict with the already-fitted XG200 model.
X_train_fs, X_test_fs = select_features(res, Y_train, res2, 200)
xg_test_set = XG200.predict(X_test_fs)

sub = pd.DataFrame({"appno": test.appno, "importance": xg_test_set})
sub.to_csv("result_tunedxg200.csv", index=False)

Catboost Modeling and Prediction

In [None]:
from catboost import CatBoostClassifier

# With 150 features
X_train_fs, X_test_fs = select_features(res, Y_train, res_test, 150)

cat150 = CatBoostClassifier(n_estimators=1005, random_state=40)

# Out-of-fold predictions from 5-fold cross-validation on the training split
cat_pred = cross_val_predict(
    cat150, X_train_fs, Y_train, cv=5, n_jobs=-1, method="predict"
)
cat_cv_accuracy = accuracy_score(cat_pred, Y_train) * 100
print("cv score: ", cat_cv_accuracy)

# Refit on the whole training split; the next cell scores the hold-out split
cat150.fit(X_train_fs, Y_train)

In [None]:
# Score cat150 on the hold-out split, using the X_test_fs produced in the previous cell
cat_test = cat150.predict(X_test_fs)
cat_test_accuracy = accuracy_score(cat_test, Y_test) * 100

print("cv score: Test ", cat_test_accuracy)

In [None]:
# With 200 features
X_train_fs, X_test_fs = select_features(res, Y_train, res_test, 200)

cat200 = CatBoostClassifier(n_estimators=1005, random_state=40)

# Out-of-fold predictions from 5-fold cross-validation on the training split
cat_pred = cross_val_predict(
    cat200, X_train_fs, Y_train, cv=5, n_jobs=-1, method="predict"
)
cat_cv_accuracy = accuracy_score(cat_pred, Y_train) * 100
print("cv score: ", cat_cv_accuracy)

# Refit on the whole training split and score the hold-out split
cat200.fit(X_train_fs, Y_train)
cat_test = cat200.predict(X_test_fs)
cat_test_accuracy = accuracy_score(cat_test, Y_test) * 100

print("cv score: Test ", cat_test_accuracy)

In [None]:
# Re-fit the 150-feature selector on the training split and apply it to the
# competition test matrix (res2), then predict with the already-fitted cat150 model.
X_train_fs, X_test_fs = select_features(res, Y_train, res2, 150)
cat_test = cat150.predict(X_test_fs)

# Start from a fresh frame, as the XGBoost submission cells do, rather than
# reusing whatever `sub` was left behind by an earlier cell.
sub = pd.DataFrame(columns=["appno", "importance"])
sub["appno"] = test.appno
sub["importance"] = cat_test
sub.to_csv("result_cat150.csv", index=False)

LGBM

In [None]:
# With 200 Features
X_train_fs, X_test_fs = select_features(res, Y_train, res_test, 200)

lg200 = LGBMClassifier(n_estimators=1000, max_depth=7, learning_rate=0.01)

# Out-of-fold predictions from 5-fold cross-validation on the training split
lg200_pred = cross_val_predict(
    lg200, X_train_fs, Y_train, cv=5, n_jobs=-1, method="predict"
)
lg200_cv_accuracy = accuracy_score(lg200_pred, Y_train) * 100
print("cv score: ", lg200_cv_accuracy)

# Refit on the whole training split and score the hold-out split
lg200.fit(X_train_fs, Y_train)
lg200_test = lg200.predict(X_test_fs)
lg200_test_accuracy = accuracy_score(lg200_test, Y_test) * 100

print("cv score: Test ", lg200_test_accuracy)

In [None]:
# With 150 Features
X_train_fs, X_test_fs = select_features(res, Y_train, res_test, 150)

lg150 = LGBMClassifier(n_estimators=1000, max_depth=7, learning_rate=0.01)

# Out-of-fold predictions from 5-fold cross-validation on the training split
lg150_pred = cross_val_predict(
    lg150, X_train_fs, Y_train, cv=5, n_jobs=-1, method="predict"
)
lg150_cv_accuracy = accuracy_score(lg150_pred, Y_train) * 100
print("cv score: ", lg150_cv_accuracy)

# Refit on the whole training split and score the hold-out split
lg150.fit(X_train_fs, Y_train)
lg150_test = lg150.predict(X_test_fs)
lg150_test_accuracy = accuracy_score(lg150_test, Y_test) * 100

print("cv score: Test ", lg150_test_accuracy)

Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the three 150-feature models, trained on the 150 selected features
X_train_fs, X_test_fs = select_features(res, Y_train, res_test, 150)
base_models = [('lgb', lg150), ('xg', XG), ('cat', cat150)]
eclf1 = VotingClassifier(estimators=base_models, voting='soft')
eclf1 = eclf1.fit(X_train_fs, Y_train)

# Score the ensemble on the hold-out split
vpred = eclf1.predict(X_test_fs)
voting_test_accuracy = accuracy_score(vpred, Y_test) * 100

print("cv score: Test ", voting_test_accuracy)

Voting Classifier gives the highest accuracy