In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline 

# Import warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
#Load the Data
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

In [10]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [11]:
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [12]:
train.isnull().sum() #checking for null values

lang_id    0
text       0
dtype: int64

In [13]:
train['lang_id'].value_counts() # total number of each obsservation under the train data

tsn    3000
zul    3000
sot    3000
eng    3000
nso    3000
xho    3000
nbl    3000
ven    3000
ssw    3000
afr    3000
tso    3000
Name: lang_id, dtype: int64

In [14]:
train.info() #information about the train data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [3]:
#Base models
names = ['Logistic Regression','Random Forest', 'Nearest Neighbors', 
         'Decision Tree','MultinomialNB', 'XG Boost']

In [12]:
# List of classifiers
classifiers = [
    Pipeline([('tfid', TfidfVectorizer(max_df = 0.4,
                                      smooth_idf = True,
                                      stop_words = 'english',
                                      ngram_range = (1,2))),
             ('clf', LogisticRegression())]),
    Pipeline([('tfid', TfidfVectorizer(max_df = 0.4,
                                      smooth_idf = True,
                                      stop_words = 'english',
                                      ngram_range = (1,2))),
             ('clf', RandomForestClassifier())]),
    Pipeline([('tfid', TfidfVectorizer(max_df = 0.4,
                                      smooth_idf = True,
                                      stop_words = 'english',
                                      ngram_range = (1,2))),
             ('clf', KNeighborsClassifier())]),
    Pipeline([('tfid', TfidfVectorizer(max_df = 0.4,
                                      smooth_idf = True,
                                      stop_words = 'english',
                                      ngram_range = (1,2))),
             ('clf', DecisionTreeClassifier())]),
    Pipeline([('tfid', TfidfVectorizer(max_df = 0.4,
                                      smooth_idf = True,
                                      stop_words = 'english',
                                      ngram_range = (1,2))),
             ('clf', MultinomialNB())]),

        
]

In [9]:
y = train['lang_id']
X = train['text']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 10) #Splitting the datat into training nd testing set

In [13]:
results = []

models = {}
confusion = {}
class_report = {}


for name, clf in zip(names, classifiers):    
    print ('Fitting {:s} model...'.format(name))
    run_time = %timeit -q -o clf.fit(X_train, y_train) #Training the model
    
    print ('... predicting')
    y_pred = clf.predict(X_train)   
    y_pred_test = clf.predict(X_test)
    
    models[name] = clf #storing the trained models in the models dictionary    
    
    results.append([name, run_time.best]) 

    
results = pd.DataFrame(results, columns=['Classifier', 'Train Time'])
results.set_index('Classifier', inplace= True)

Fitting Logistic Regression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

... predicting
Fitting Random Forest model...
... predicting
Fitting Nearest Neighbors model...
... predicting
Fitting Decision Tree model...
... predicting
Fitting MultinomialNB model...
... predicting
Fitting Linear SVC model...
... predicting
Fitting XG Boost model...




... predicting


In [16]:
#Logistic Regresion
lr = models['Logistic Regression']
t = test['text']
y_pred_lr = lr.predict(t)
sub = pd.DataFrame( data = {'index': test['index'],
                             'lang_id': y_pred_lr })
sub.to_csv('submission_lr2.csv', index = False)

#Random forest
rf = models['Random Forest']
y_pred_rf = rf.predict(t)
sub = pd.DataFrame( data = {'index': test['index'],
                             'lang_id': y_pred_rf })
sub.to_csv('submission_fr.csv', index = False)

#Nearest Neighbors
nn = models['Nearest Neighbors']
y_pred_nn = nn.predict(t)
sub = pd.DataFrame( data = {'index': test['index'],
                             'lang_id': y_pred_nn })
sub.to_csv('submission_nn.csv', index = False)






# Hyperparameter Tuning

In [35]:
# MultinomialNB using the hyperparameter
multi = Pipeline([('tfid', TfidfVectorizer()),
             ('clf', MultinomialNB(alpha = 0.2))])
multi.fit(X_train, y_train)
t = test['text']
y_pred_m = multi.predict(t)
sub = pd.DataFrame( data = {'index': test['index'],
                             'lang_id': y_pred_m })
sub.to_csv('submission_m2.csv', index = False)

## Random Forest

In [20]:
# Logistic
tfid = TfidfVectorizer()
text = tfid.fit_transform(train['text'])
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(text,y, test_size = 0.2, random_state = 10)
n_estimators = [10, 100, 1000, 2000]
max_depth = [None, 5, 10, 20]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

rf = RandomForestClassifier(random_state=10)

# search the grid
grid = GridSearchCV(estimator=rf, 
                    param_grid=param_grid,
                    cv=2,
                    verbose=2,
                    n_jobs=-1)

grid_result = grid.fit(X_train_h, y_train_h)
grid_result.best_params_

Fitting 2 folds for each of 16 candidates, totalling 32 fits


{'max_depth': None, 'n_estimators': 1000}

In [18]:
rf = Pipeline([('tfid', TfidfVectorizer()),
             ('clf', RandomForestClassifier(n_estimators = 2000, max_depth = None))])
rf.fit(X_train, y_train)
t = test['text']
y_pred_m = rf.predict(t)
sub = pd.DataFrame( data = {'index': test['index'],
                             'lang_id': y_pred_m })
sub.to_csv('submission_rf2.csv', index = False)