In [65]:
# Standard library
import re
import string
import warnings

# Packages for data analysis
import numpy as np
import pandas as pd

# Packages for visualizations
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns

# Packages for preprocessing
import nltk
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

# Packages for training models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer
%matplotlib inline

# Style
sns.set(font_scale=1.5)
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and newer releases no longer accept the old names, so pick whichever
# spelling the installed Matplotlib actually provides.
for _style_suffix in ('pastel', 'poster'):
    _versioned_name = 'seaborn-v0_8-' + _style_suffix
    if _versioned_name in style.available:
        style.use(_versioned_name)
    else:
        style.use('seaborn-' + _style_suffix)

# Silences every Python warning for the rest of the session
# (including deprecation warnings from pandas / scikit-learn).
warnings.filterwarnings("ignore")


In [33]:
# Fetch NLTK's 'vader_lexicon' resource into the local nltk_data folder
# (NLTK reports "already up-to-date" when it is present).
# NOTE(review): no cell visible in this section uses the lexicon - confirm it is still needed.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Loading of Dataset

In [73]:
# Read the three CSV files (training rows, test rows, submission template)
# from the working directory, in that order.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train_set.csv', 'test_set.csv', 'sample_submission.csv')
)

In [35]:
# Peek at the first seven training texts
print(train['text'].iloc[:7])

0    umgaqo-siseko wenza amalungiselelo kumaziko ax...
1    i-dha iya kuba nobulumko bokubeka umsebenzi na...
2    the province of kwazulu-natal department of tr...
3    o netefatša gore o ba file dilo ka moka tše le...
4    khomishini ya ndinganyiso ya mbeu yo ewa maana...
5    dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6    kgetse nngwe le nngwe e e sa faposiwang mo tsh...
Name: text, dtype: object


In [61]:
# First seven rows of the test frame
test.iloc[:7]

Unnamed: 0,index,text
0,1,mmasepala fa maemo a a kgethegileng a letlelel...
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,tshivhumbeo tshi fana na ngano dza vhathu
3,4,kube inja nelikati betingevakala kutsi titsini...
4,5,winste op buitelandse valuta
5,6,ke feela dilense te hlakilego ta pono e tee go...
6,7,am final gems birthing optionszulutxt


In [37]:
# First five rows of the submission template (shows the expected columns)
sample_submission.iloc[:5]

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


# General Overview of Dataset

In [25]:
# Number of training rows per language label
train['lang_id'].value_counts()

nbl    3000
ven    3000
sot    3000
eng    3000
afr    3000
tso    3000
tsn    3000
ssw    3000
zul    3000
xho    3000
nso    3000
Name: lang_id, dtype: int64

In [28]:
def _print_overview(frame, heading, rule, missing_template):
    """Print shape, distinct-text count and per-column null counts of a frame.

    frame:            DataFrame with a 'text' column
    heading:          title line printed first
    rule:             underline printed below the title
    missing_template: format string used for the null-count section
    """
    print(heading)
    print(rule + '\n')
    print('Shape of the dataset: {}\n'.format(frame.shape))
    # The label says "tweets" but the count is simply the distinct 'text' values
    print('Total Number of unique tweets: {}\n'.format(len(set(frame['text']))))
    print(missing_template.format(frame.isnull().sum()))


# Same summary for the training frame and the test frame
_print_overview(train, 'TRAINING DATA', '=============',
                'Total Number of missing values:\n{}\n\n')
_print_overview(test, 'TEST DATA', '=========',
                'Total Number of missing values:\n{}\n')


TRAINING DATA

Shape of the dataset: (33000, 2)

Total Number of unique tweets: 29948

Total Number of missing values:
lang_id    0
text       0
dtype: int64


TEST DATA

Shape of the dataset: (5682, 2)

Total Number of unique tweets: 5459

Total Number of missing values:
index    0
text     0
dtype: int64



# Data Preprocessing

In [72]:
def clean_text(text):
    """
    Normalise one text sample before it is vectorised.

    Anything that looks like an HTML tag (`<...>` on a single line) is
    replaced with a space, the text is converted to lowercase, and every
    run of whitespace (spaces, tabs, newlines) is collapsed into a single
    space with leading/trailing whitespace dropped.
    Punctuation and digits outside of tags are kept as they are.

    Input:
    text: original text
          datatype: string (None / NaN is treated as an empty string)

    Output:
    text: modified text
          datatype: string
    """
    # Missing values arrive as None or as float NaN (the only value != itself)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Replace tags with a space so the words on either side stay separated
    text = re.sub('<.*?>', ' ', text)
    # Convert to lower case
    text = text.lower()
    # split() with no argument splits on any whitespace, newlines included,
    # so a separate newline substitution is not required
    return ' '.join(text.split())

In [74]:
# Run clean_text over every row of the 'text' column in both frames
for _frame in (train, test):
    _frame['text'] = _frame['text'].map(clean_text)


In [87]:
# Features are the text column, targets are the language labels.
# Both names refer to the columns as they exist when this cell runs: a later
# cell that assigns a new train['text'] does not change X unless it rebinds X.
X, y = train['text'], train['lang_id']

In [85]:
# Spell out the ".txt" suffix as words. regex=False makes the match literal:
# pandas < 2.0 treats the pattern as a regular expression by default, where
# "." matches any character (so e.g. "utxt" would be rewritten too), while
# pandas >= 2.0 matches literally - the explicit flag removes that difference.
train["text"] = train["text"].str.replace(".txt", " text file", regex=False)
# The assignment above stores a new column object, so point X at it;
# an X taken before this cell would otherwise keep the unreplaced text
# while the test frame receives the replacement.
X = train["text"]

In [86]:
# Spell out the ".txt" suffix as words. regex=False makes the match literal:
# pandas < 2.0 treats the pattern as a regular expression by default, where
# "." matches any character, while pandas >= 2.0 matches literally - the
# explicit flag gives the same result on every version.
test["text"] = test["text"].str.replace(".txt", " text file", regex=False)

In [None]:
# Hold out 10% of the training rows for validation. The fixed seed keeps the
# split - and therefore the reported scores - identical on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10,
                                                  random_state=42)

In [None]:
# Word uni- and bi-gram TF-IDF features; terms seen in fewer than two
# documents are dropped and accented characters are folded to ASCII.
lsvc_vectorizer = TfidfVectorizer(
    min_df=2,
    strip_accents='ascii',
    smooth_idf=False,
    ngram_range=(1, 2),
)
lsvc_classifier = LinearSVC(random_state=42)

# Vectorizer and linear SVM chained into one estimator
LSVC = Pipeline([
    ('LSVC_tfidf', lsvc_vectorizer),
    ('LSVC', lsvc_classifier),
])
LSVC.fit(X_train, y_train)

# Per-language precision / recall / F1 on the validation split
ypred_lsvc = LSVC.predict(X_val)
print(classification_report(y_val, ypred_lsvc))


In [None]:
# Write the LinearSVC submission file: one predicted lang_id per test index.
# The test frame is loaded under the name `test`; `df_test` is not defined
# anywhere, so using it raises NameError.
sub_df = pd.DataFrame(test['index'])
sub_df['lang_id'] = LSVC.predict(test['text'])
sub_df.to_csv('Linear_SVC.csv', index=False)

## Multinomial Naive Bayes

In [88]:
# Hold out 2% of the training rows for validation, with a fixed seed so the
# split is the same on every run.
# NOTE(review): the author's note here read ".15 is the best .20 is better
# than .25", which does not match the 0.02 actually used - confirm the
# intended validation size.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.02,
                                                  random_state=42)

In [89]:
# Candidate smoothing values for MultinomialNB
param_grid = {'alpha': [0.01, 0.1, 1, 5, 10, 20, 50]}

# TF-IDF features (uni- and bi-grams, terms in more than half of the documents
# dropped) feed a grid search that picks alpha with cv=10 on weighted F1.
# Only the classifier is tuned; the vectorizer settings are fixed.
tuned_nb = Pipeline([('tfidf', TfidfVectorizer(min_df=1,
                                               max_df=0.5,
                                               ngram_range=(1, 2))),
                     ('nb', GridSearchCV(MultinomialNB(),
                                         param_grid=param_grid,
                                         cv=10,
                                         n_jobs=-1,
                                         scoring='f1_weighted'))
                     ])

tuned_nb.fit(X_train, y_train)

# Score the validation split so the hold-out predictions are actually used,
# in the same way as the LinearSVC cell reports its results.
tuned_nb2 = tuned_nb.predict(X_val)
print(classification_report(y_val, tuned_nb2))



In [90]:
# Write the Naive Bayes submission file: test index plus predicted lang_id
nb_test_predictions = tuned_nb.predict(test['text'])
submission_df = test['index'].to_frame().assign(lang_id=nb_test_predictions)
submission_df.to_csv('submission_multinomial_nb.csv', index=False)