In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms


In [2]:
# Load the raw consumer-complaints table and preview its first rows.
csv_path = '../input/consumer-complaint-classification/Consumer_Complaints.csv'
data = pd.read_csv(csv_path)
data.head()

In [3]:
# Dimensions of the raw table as a (rows, columns) tuple.
data.shape

In [4]:
# Per-column count of missing values, followed by a visual missingness matrix.
missing_per_column = data.isna().sum()
print(missing_per_column)
ms.matrix(data)

In [5]:
# Most frequent value(s) in the 'Sub-product' column.
data['Sub-product'].mode()

In [6]:
# I am going to drop the unnecessary columns because they are not important for this task.

In [7]:
# Only the product label and the complaint text are needed for classification.
needed_columns = ['Product', 'Consumer Complaint']
new_data = data[needed_columns]
new_data.head()

In [8]:
# How many complaints fall under each product label.
new_data['Product'].value_counts()

In [9]:
# Keep only the rows that actually contain complaint text.
has_complaint_text = new_data['Consumer Complaint'].notna()
new_data = new_data[has_complaint_text]
new_data.shape

In [10]:
# Encode each Product label as an integer id. The id must come from 'Product'
# (the classification target): factorizing the complaint text would hand out a
# separate id to every distinct complaint instead of one id per class.
product_codes = new_data['Product'].factorize()[0]
# assign() returns a new frame, so nothing is written into a filtered slice of
# the original table (avoids pandas' SettingWithCopyWarning).
new_data = new_data.assign(category_id=product_codes)
new_data.head()

In [11]:
# One row per distinct (Product, category_id) pair, ordered by the id.
product_id_pairs = new_data[['Product', 'category_id']].drop_duplicates()
cat_id = product_id_pairs.sort_values('category_id')

In [12]:
# Lookup tables in both directions between product name and category id.
# cat_id has exactly two columns, in the order (Product, category_id).
cat_to_id = {product: cat for product, cat in cat_id.values}
id_to_cat = {cat: product for product, cat in cat_id.values}
new_data.head()

In [13]:
# Bar chart of the number of complaints recorded for each product.
fig, ax = plt.subplots(figsize=(10, 8))
complaints_per_product = new_data.groupby('Product')['Consumer Complaint'].count()
complaints_per_product.plot.bar(ylim=0, ax=ax)


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams with English stop words removed; terms that
# occur in fewer than 5 documents are ignored and rows are L2-normalised.
tfid = TfidfVectorizer(sublinear_tf =True, min_df = 5,norm ='l2' , ngram_range =(1,2), stop_words ='english')
# Keep the sparse matrix that fit_transform returns. A dense copy (.toarray())
# allocates n_documents x n_terms floats, which can exhaust memory on a large
# corpus with bigram features.
features = tfid.fit_transform(new_data['Consumer Complaint'])

# Integer class ids aligned row-for-row with `features`.
labels = new_data.category_id


In [15]:
from sklearn.model_selection import train_test_split

# Hold out part of the complaints for testing; the fixed seed keeps the split repeatable.
complaint_text = new_data['Consumer Complaint']
product_label = new_data['Product']
x_train, x_test, y_train, y_test = train_test_split(
    complaint_text,
    product_label,
    random_state=0,
)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; it is fitted in a later cell.
count = CountVectorizer()


In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the vocabulary from the training text only, then reuse it for the test text.
count_train = count.fit_transform(x_train)
count_test = count.transform(x_test)

tfid_transform = TfidfTransformer()
# IDF weights are estimated from the training counts...
tdif_train = tfid_transform.fit_transform(count_train)
# ...and only applied to the test counts. Calling fit_transform here would
# re-estimate the IDF weights from the test set, so the test features would be
# scaled differently from the features the classifier was trained on.
tdif_test = tfid_transform.transform(count_test)

In [18]:
from sklearn.svm import LinearSVC

# Train a linear SVM on the TF-IDF training matrix, then label the test matrix.
clf = LinearSVC()
clf.fit(tdif_train, y_train)
predict = clf.predict(tdif_test)


In [19]:
from sklearn import metrics

# Per-class precision, recall and F1 of the predictions on the test split.
report = metrics.classification_report(y_test, predict)
print(report)

In [20]:
from sklearn.pipeline import Pipeline

# Chain TF-IDF vectorization and a linear SVM so raw text can be passed in directly.
tfidf_step = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words='english',
)
text_clf = Pipeline(steps=[('tf', tfidf_step), ('clf', LinearSVC())])

In [21]:
# Fit the whole pipeline (vectorizer + classifier) on the raw training text.
text_clf.fit(x_train,y_train)

In [22]:
# Predicted product labels for the held-out text, using the fitted pipeline.
predict_text = text_clf.predict(x_test)

In [23]:
from sklearn.model_selection import cross_val_score

cross_val_score = cross_val_score(text_clf , x_train,y_train,cv=5,scoring ='accuracy',n_jobs =-1)

In [24]:
print(np.mean(cross_val_score))

In [25]:
from sklearn.naive_bayes import BernoulliNB

# Same TF-IDF front end as before, now feeding a Bernoulli naive Bayes classifier.
# (The stray `nb = BernoulliNB` binding was removed: it referenced the class
# itself rather than an instance and was never used.)
text_clf = Pipeline([('tf' ,TfidfVectorizer(sublinear_tf =True, min_df = 5,norm ='l2' , ngram_range =(1,2), stop_words ='english')),
                     ('clf',BernoulliNB())])

In [26]:
# Fit the current pipeline on the raw training text.
text_clf.fit(x_train, y_train)

In [27]:
# Label the held-out text with the fitted pipeline and summarise per-class quality.
predict = text_clf.predict(x_test)
report = metrics.classification_report(y_test, predict)
print(report)

In [28]:
# Import in this cell so the name is bound to the scikit-learn function here.
from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validated accuracy of the pipeline on the training split.
cross_val = cross_val_score(
    text_clf,
    x_train,
    y_train,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
print(np.mean(cross_val))

In [30]:
from sklearn.tree import DecisionTreeClassifier

# TF-IDF front end feeding a decision tree. The tree is seeded (same seed as
# the train/test split) so the reported scores are reproducible between runs.
text_clf = Pipeline([('tf' ,TfidfVectorizer(sublinear_tf =True, min_df = 5,norm ='l2' , ngram_range =(1,2), stop_words ='english')),
                     ('clf',DecisionTreeClassifier(random_state=0))])
text_clf.fit(x_train,y_train)


In [44]:
# Held-out predictions and accuracy for the fitted pipeline.
predict = text_clf.predict(x_test)
model_score = text_clf.score(x_test, y_test)
print('model score', model_score)

# 5-fold cross-validated accuracy on the training split.
cv_options = dict(cv=5, scoring='accuracy', n_jobs=-1)
cross_val = cross_val_score(text_clf, x_train, y_train, **cv_options)
print('cross val score ', np.mean(cross_val))

In [46]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF front end feeding a random forest. The forest is seeded (same seed as
# the train/test split) so the reported scores are reproducible between runs.
text_clf = Pipeline([('tf' ,TfidfVectorizer(sublinear_tf =True, min_df = 5,norm ='l2' , ngram_range =(1,2), stop_words ='english')),
                     ('clf',RandomForestClassifier(random_state=0))])
text_clf.fit(x_train,y_train)



In [47]:
# Predictions for the held-out text from the fitted pipeline.
predict = text_clf.predict(x_test)

# Accuracy on the held-out split.
model_score = text_clf.score(x_test, y_test)
print('model score', model_score)

# Accuracy averaged over 5 cross-validation folds of the training split.
cross_val = cross_val_score(
    text_clf,
    x_train,
    y_train,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
print('cross val score ', np.mean(cross_val))

In [50]:
from lightgbm import LGBMClassifier 

# TF-IDF front end feeding a LightGBM classifier. The pipeline previously
# instantiated RandomForestClassifier here (copy-paste from the cell above),
# so LightGBM was imported but never evaluated.
text_clf = Pipeline([('tf' ,TfidfVectorizer(sublinear_tf =True, min_df = 5,norm ='l2' , ngram_range =(1,2), stop_words ='english')),
                     ('clf',LGBMClassifier(random_state=0))])
text_clf.fit(x_train,y_train)

In [51]:
# Per-class precision, recall and F1 of the fitted pipeline on the held-out text.
predict = text_clf.predict(x_test)
report = metrics.classification_report(y_test, predict)
print(report)

In [54]:
# Score the fitted pipeline on the held-out split.
predict = text_clf.predict(x_test)
model_score = text_clf.score(x_test, y_test)
print('model score', model_score)

# Cross-validate on the training split (5 folds, accuracy, all cores).
fold_scores = cross_val_score(text_clf, x_train, y_train,
                              cv=5, scoring='accuracy', n_jobs=-1)
cross_val = fold_scores
print('cross val score ', np.mean(cross_val))

Hyperparameter tuning

In [55]:
from sklearn.model_selection import RandomizedSearchCV ,GridSearchCV

# Candidate hyperparameter values for the random-forest search.
n_estimators = list(range(100, 1300, 100))   # 100, 200, ..., 1200 trees
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for classifiers
# it was an alias of 'sqrt', so the old list sampled the same setting twice.
# 'log2' is offered as the second, genuinely different option.
max_features = ['sqrt', 'log2']
max_depth = list(range(5, 35, 5))            # 5, 10, ..., 30
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]


# Parameter-name -> candidate-values mapping in the form RandomizedSearchCV expects.
search_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}


In [58]:
#random forest 
from sklearn.ensemble import RandomForestClassifier

# Base estimator gets its own name so `rf` only ever refers to the search object.
# Both the forest and the parameter sampler are seeded (same seed as the
# train/test split) so the 10 sampled configurations and their scores are
# reproducible between runs.
rf_base = RandomForestClassifier(random_state=0)

rf = RandomizedSearchCV(estimator=rf_base, param_distributions=search_grid, cv=5,
                        scoring='accuracy', n_iter=10, n_jobs=-1, random_state=0)

# Search over the count + TF-IDF training matrix built earlier in the notebook.
rf.fit(tdif_train,y_train)

In [61]:
# Mean cross-validated accuracy of the best configuration found by the search.
rf.best_score_

In [60]:
# Hyperparameter values of the best configuration found by the search.
rf.best_params_