In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV into `train` and display it as the cell output.
train_csv_path = '../input/hackerearth-ml-solving-the-citizens-grievances/dataset/train.csv'
train = pd.read_csv(filepath_or_buffer=train_csv_path)
train

In [None]:
# Show each sharepointid next to the first three characters of its string
# form. This is a read-only preview: nothing is written back to `train`.
sharepoint_ids = train['sharepointid']
pd.DataFrame({
    'sharepointid': sharepoint_ids,
    'prefix': sharepoint_ids.astype(str).str[:3],
})

***Dropping unwanted features***

In [None]:
train = train.drop(['decisiondate','application','docname','ecli','introductiondate','country.name','itemid','languageisocode','originatingbody_type','originatingbody_name','sharepointid','documentcollectionid=CASELAW' ,'documentcollectionid=JUDGMENTS','documentcollectionid=CHAMBER','documentcollectionid=ENG','documentcollectionid=COMMITTEE','documentcollectionid=GRANDCHAMBER','typedescription'],axis = 1)

In [None]:
test = pd.read_csv('../input/hackerearth-ml-solving-the-citizens-grievances/dataset/test.csv')

In [None]:
test = test.drop(['decisiondate','application','docname','ecli','introductiondate','country.name','itemid','languageisocode','originatingbody_type','originatingbody_name','sharepointid','documentcollectionid=CASELAW' ,'documentcollectionid=JUDGMENTS','documentcollectionid=CHAMBER','documentcollectionid=ENG','documentcollectionid=COMMITTEE','documentcollectionid=GRANDCHAMBER','typedescription'],axis = 1)

In [None]:
from sklearn import preprocessing

# Replace the country codes with integer labels.
# The encoder is fitted on the codes of BOTH frames so that a code occurring
# only in the test set cannot make `transform` raise on an unseen label.
# LabelEncoder sorts its classes, so whenever every test code also occurs in
# train the resulting integers are identical to fitting on train alone.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(
    pd.concat([train['country.alpha2'], test['country.alpha2']], ignore_index=True)
)
train['country.alpha2'] = label_encoder.transform(train['country.alpha2'])
test['country.alpha2'] = label_encoder.transform(test['country.alpha2'])

***Encoding some feature values***

In [None]:
db=pd.get_dummies(train['doctypebranch'],drop_first=True)
db.head()

In [None]:
db=pd.get_dummies(test['doctypebranch'],drop_first=True)
db.head()

In [None]:
train=pd.concat([train,db],axis=1)

In [None]:
test=pd.concat([test,db],axis=1)

In [None]:
train = train.drop(['doctypebranch'],axis=1)
test = test.drop(['doctypebranch'],axis=1)

In [None]:
cols = [c for c in train.columns if c.lower()[:5] != 'issue']

In [None]:
train = train[cols]

***Dropping issue, parties, respondent columns***

In [None]:
# Discard every test column whose lowercase name starts with "issue".
col = list(filter(lambda name: not name.lower().startswith('issue'), test.columns))
test = test[col]

In [None]:
# Discard every training column whose lowercase name starts with "parties".
cols = [name for name in train.columns if not name.lower().startswith('parties')]
train = train[cols]

In [None]:
# Discard every test column whose lowercase name starts with "parties".
col = list(filter(lambda name: not name.lower().startswith('parties'), test.columns))
test = test[col]

In [None]:
# Discard every training column whose lowercase name starts with "respondent".
cols = [name for name in train.columns if not name.lower().startswith('respondent')]
train = train[cols]

In [None]:
# Discard every test column whose lowercase name starts with "respondent".
col = list(filter(lambda name: not name.lower().startswith('respondent'), test.columns))
test = test[col]

***Splitting judgementdate feature into 3 features of day, month, year***

In [None]:
# Replace the raw `judgementdate` column of the training frame with numeric
# `day`, `month` and `year` columns.
# The parsed dates are kept in a local Series rather than written back into
# `train`, so the frame is built in one chain without in-place mutation.
# NOTE(review): no explicit format/dayfirst is passed to to_datetime; confirm
# that pandas' inference matches the date layout used in the CSV.
train_judgement_dates = pd.to_datetime(train['judgementdate'])
train = train.drop(columns=['judgementdate']).assign(
    day=train_judgement_dates.dt.day,
    month=train_judgement_dates.dt.month,
    year=train_judgement_dates.dt.year,
)

In [None]:
# Replace the raw `judgementdate` column of the test frame with numeric
# `day`, `month` and `year` columns.
# The parsed dates are kept in a local Series rather than written back into
# `test`, so the frame is built in one chain without in-place mutation.
test_judgement_dates = pd.to_datetime(test['judgementdate'])
test = test.drop(columns=['judgementdate']).assign(
    day=test_judgement_dates.dt.day,
    month=test_judgement_dates.dt.month,
    year=test_judgement_dates.dt.year,
)

In [None]:
# Replace the raw `kpdate` column of the training frame with numeric
# `day_k`, `month_k` and `year_k` columns.
# The parsed dates are kept in a local Series rather than written back into
# `train`, so the frame is built in one chain without in-place mutation.
train_kp_dates = pd.to_datetime(train['kpdate'])
train = train.drop(columns=['kpdate']).assign(
    day_k=train_kp_dates.dt.day,
    month_k=train_kp_dates.dt.month,
    year_k=train_kp_dates.dt.year,
)

In [None]:
# Replace the raw `kpdate` column of the test frame with numeric
# `day_k`, `month_k` and `year_k` columns.
# The parsed dates are kept in a local Series rather than written back into
# `test`, so the frame is built in one chain without in-place mutation.
test_kp_dates = pd.to_datetime(test['kpdate'])
test = test.drop(columns=['kpdate']).assign(
    day_k=test_kp_dates.dt.day,
    month_k=test_kp_dates.dt.month,
    year_k=test_kp_dates.dt.year,
)

In [None]:
so=pd.get_dummies(train['separateopinion'],drop_first=True)
so.head()

In [None]:
train = train.drop(['separateopinion'],axis=1)

In [None]:
train=pd.concat([train,so],axis=1)

In [None]:
so=pd.get_dummies(test['separateopinion'],drop_first=True)

In [None]:
test = test.drop(['separateopinion'],axis=1)

In [None]:
test=pd.concat([test,so],axis=1)

In [None]:
# Display the training frame as it stands after the preprocessing cells above.
train

In [None]:
# Target vector: the `importance` column of the training frame.
train_y = train.loc[:, 'importance']

In [None]:
# Feature matrix: everything except the `appno` column and the target.
train_x = train.drop(columns=['appno', 'importance'])

***Hyperparameter Tuning***

Most of the time, XGBoost works well on an unbiased dataset.
You can also try techniques such as SMOTEBoost or CatBoost.


In [None]:
# Candidate values for each XGBoost hyperparameter; the randomized search
# samples combinations from these lists.
params = dict(
    learning_rate=[0.10, 0.15, 0.20],
    max_depth=[5, 6, 7, 8],
    min_child_weight=[1, 3, 5],
    gamma=[0.0, 0.1, 0.2],
    colsample_bytree=[0.3, 0.4, 0.5],
)

In [None]:
## Hyperparameter optimization using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score,roc_auc_score

In [None]:
# Base estimator handed to the hyperparameter search: 1000 boosting rounds,
# a single thread per model.
# NOTE(review): the objective is fixed to 'binary:logistic'; confirm that the
# `importance` target really has only two classes.
classifier = xgboost.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1000,
    nthread=1,
)

In [None]:
# Randomized search over `params`: 5 sampled candidates, each evaluated with
# 5-fold cross-validation and scored by accuracy, using 4 parallel jobs.
# random_state pins which candidates are sampled, so best_params_ is
# repeatable across a Restart & Run All instead of changing from run to run.
random_search = RandomizedSearchCV(
    classifier,
    param_distributions=params,
    n_iter=5,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
    verbose=3,
    random_state=42,
)

In [None]:
# Run the search: fit every sampled candidate on the training features/target.
random_search.fit(X=train_x, y=train_y)

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
random_search.best_params_

In [None]:
idno = test['appno']

In [None]:
test = test.drop(['appno'],axis=1)

In [None]:
pred = random_search.predict(test)

In [None]:
# Display the predictions returned for the test rows.
pred

In [None]:
df = pd.DataFrame()

In [None]:
df['appno'] = idno

In [None]:
df['importance'] = pred

In [None]:
df

In [None]:
df.to_csv('file.csv',index=False) 

***If you like the notebook, please drop a like***