In [1]:
# Packages for data analysis
import pandas as pd
import numpy as np
from time import time

# Packages for visualizations
import seaborn as sns
import matplotlib.style as style
from wordcloud import WordCloud

# Package for model storage
import pickle

import googletrans
from googletrans import Translator

# Packages for preprocessing
import nltk
import string
import re
import contractions
import warnings
import emoji
import itertools
import spacy

from textblob import TextBlob
from langdetect import detect
from PIL import Image
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from advertools.emoji import extract_emoji

# Packages for training models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE

# Model Evaluation Packages
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.metrics import make_scorer

import matplotlib.pyplot as plt
%matplotlib inline

# Plot appearance: set the seaborn font scale, then apply the two matplotlib
# style sheets in the order listed (the later sheet is applied on top of the
# earlier one).
sns.set(font_scale=1.5)
style.use(['seaborn-pastel', 'seaborn-poster'])

# Suppress all warnings from this point on in the session.
warnings.filterwarnings("ignore")


  import pandas.util.testing as tm


In [30]:
# Load the spaCy pipeline named 'en_core_web_lg' and keep it in `nlp`.
nlp = spacy.load('en_core_web_lg')

In [31]:
# Make sure the NLTK 'vader_lexicon' resource is available locally.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Loading of Dataset

In [5]:
# Read the training set, the test set and the sample submission file
# (in that order) from the working directory into DataFrames.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train_set.csv', 'test_set.csv', 'sample_submission.csv')
)

In [19]:
# Preview the first seven entries of the training 'text' column.
print(train['text'].head(7))

0    umgaqo-siseko wenza amalungiselelo kumaziko ax...
1    i-dha iya kuba nobulumko bokubeka umsebenzi na...
2    the province of kwazulu-natal department of tr...
3    o netefatša gore o ba file dilo ka moka tše le...
4    khomishini ya ndinganyiso ya mbeu yo ewa maana...
5    dinyakišišo tše tša go dirwa gabedi ka ngwaga ...
6    kgetse nngwe le nngwe e e sa faposiwang mo tsh...
Name: text, dtype: object


In [16]:
# Preview the first seven rows of the test set.
test.head(7)

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.
5,6,"Ke feela dilense tše hlakilego, tša pono e tee..."
6,7,<fn>(762010101403 AM) 1495 Final Gems Birthing...


In [20]:
# Preview the first rows of the sample submission file.
sample_submission.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


# General Overview of Dataset

In [25]:
# Number of training rows for each `lang_id` label.
train.lang_id.value_counts()

nbl    3000
ven    3000
sot    3000
eng    3000
afr    3000
tso    3000
tsn    3000
ssw    3000
zul    3000
xho    3000
nso    3000
Name: lang_id, dtype: int64

In [28]:
# General overview of both datasets: shape, number of distinct texts and
# per-column missing-value counts.
def _print_overview(heading, frame, tail):
    """
    Print a short summary of one dataset.

    Input:
    heading: title printed above the summary (underlined with '=')
    frame: DataFrame with a 'text' column
    tail: newline(s) appended after the missing-value table

    Output:
    None (the summary is written to stdout)
    """
    print(heading)
    print('=' * len(heading) + '\n')
    print('Shape of the dataset: {}\n'.format(frame.shape))
    # NOTE(review): the label says "tweets", but the previewed rows look like
    # plain text samples - confirm the intended wording.
    print('Total Number of unique tweets: {}\n'.format(len(set(frame['text']))))
    print('Total Number of missing values:\n{}{}'.format(frame.isnull().sum(), tail))


_print_overview('TRAINING DATA', train, '\n\n')
_print_overview('TEST DATA', test, '\n')


TRAINING DATA

Shape of the dataset: (33000, 2)

Total Number of unique tweets: 29948

Total Number of missing values:
lang_id    0
text       0
dtype: int64


TEST DATA

Shape of the dataset: (5682, 2)

Total Number of unique tweets: 5459

Total Number of missing values:
index    0
text     0
dtype: int64



In [None]:
def clean_text(text):
    """
    Strip markup tags from a text, convert it to lowercase and normalise
    its whitespace.

    Punctuation and digits are kept as they are; only tag-like spans
    (`<...>`), letter case and whitespace are affected.

    Input:
    text: original text
          datatype: string

    Output:
    text: modified text
          datatype: string
    """
    # Replace every tag-like span with a space. DOTALL lets `.` match a line
    # break, so a tag that is split across lines is removed as well.
    text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
    # convert to lower case
    text = text.lower()
    # split() without arguments breaks on any run of whitespace (newlines and
    # tabs included), so joining with single spaces collapses the runs and
    # trims both ends in one step.
    return ' '.join(text.split())
