## Importing Libraries

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from random import shuffle
import pickle

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt 
import seaborn as sns 


# Preprocessing
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score



# Models
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

## Main Functions

In [2]:
# remove some special characters
def remove_special_chars(sen, filter_chars):
    sen = sen.strip()
    sen = sen.lower()
    for each in sen:
        num_ascii = ord(each)
        # delete number, ".", "\", all chars in filter_chars
        if (num_ascii > 47 and num_ascii < 58) or num_ascii == 92 or num_ascii == 46 or (each in filter_chars):
            sen = sen.replace(each, "")
    return sen

# read file csv and convert it to pandasframe
def open_file(name):
    """
    
    """
    with open('{file_name}.csv'.format(file_name = "formatted_data"), encoding='Latin1') as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content] # mỗi

    data = []
    for num, each in enumerate(content):
        each = each.split(";")

        if "." in each[1]:
            sentences = each[1].split(".") 
            filter_chars = ['\t', '!', '"', '%', '&', '*', '+', ',', '-', '/', ':', '=', '?', '@', '[', ']', '§', 
                            '«', "”", "\\", ".", '»']
                    
            for number, sen in enumerate(sentences):
                """
                insert remove special characters

                """

                # filter no meaning words
                sen = remove_special_chars(sen, filter_chars)

                # make sure a sentence have len(sentence) > 0
                if len(sen)>0:
                    data.append([each[0], sen, each[2]])

        else:
            data.append(each)

    main_data = data[1:]
    main_data = shuffle(main_data)
    df = pd.DataFrame(main_data, columns = data[0])
    return df

# get data in a row
def get_data(df, row = 60000):
    return  df.iloc[row][0], df.iloc[row][1]

# vectorize sentences and split it in to train and test file
def vectorization(df, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(list(df["text"]), list(df["language"]), test_size=test_size, random_state=42)

    # vectorize sentence X
    count_vectorizer = CountVectorizer(analyzer='char')
    X_train_features = count_vectorizer.fit_transform(X_train)
    X_test_features = count_vectorizer.transform(X_test)

    # vectorize label Y
    label_encoder = preprocessing.LabelEncoder()
    y_train_features = label_encoder.fit_transform(y_train)
    y_test_features = label_encoder.transform(y_test)
    
    # getted features
    features = count_vectorizer.get_feature_names()
    
    # getted labels
    labels = list(label_encoder.classes_)
    
    return X_train_features, y_train_features, X_test_features, y_test_features, features, labels, count_vectorizer

## Reading the Dataset

In [4]:
df = open_file('formatted_data')
df.shape
label, text = get_data(df,row = 700)
print(label)
print(text)

da
desvã¦rre skete denne udvikling i europa hovedsageligt ved en stigning i produktiviteten og kun i ringe grad ved en stigning i beskã¦ftigelsen


## Training & Testing Dataset

In [5]:
X_train_features, y_train_features, X_test_features, y_test_features, features, labels, count_vectorizer = vectorization(df)

In [6]:
# number of the features
print("features: ", features)
print("\nLen features: ", len(features))

features:  [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0', '¡', '¢', '£', '¤', '¥', '¦', '¨', '©', 'ª', '¬', '\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '¼', '½', '¾', '¿', 'â', 'ã', 'ä', 'å', 'è', 'î', 'ï', 'ð', 'ñ']

Len features:  97


In [7]:
X_train_features.toarray()[0]

array([8, 4, 1, 2, 0, 6, 0, 0, 0, 5, 0, 0, 3, 2, 4, 3, 2, 1, 5, 1, 5, 4,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

## Model Development

### Naive Bayes

In [8]:
# create and training model MultinomialNB
modelNB = MultinomialNB()
modelNB.fit(X_train_features, y_train_features)

# model accuracy, how often is the classifier correct?
print("Accuracy: ",f1_score(y_test_features, modelNB.predict(X_test_features), average='macro'))

Accuracy:  0.9253365038559969


### Random Forest

In [9]:
# model Random Forest
modelRF=RandomForestClassifier(n_estimators=100)
modelRF.fit(X_train_features,y_train_features)

y_pred = modelRF.predict(X_test_features)

# model accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test_features, y_pred))

Accuracy: 0.9403787103377687


### Finding Optimal Parameters for Random Forest using Grid Search CV

In [None]:
# tuning model by grid search cv
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [50, 100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=RandomForestClassifier(), n_jobs = -1, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train_features, y_train_features)

# choose the best parametters 
print("Optimal paras: ", CV_rfc.best_params_)

### Using Optimal Parameters

In [11]:
# Create random forest model with the optimal parametter
optimal_modelRF=RandomForestClassifier(n_estimators=300, max_features= 'log2')

#Train the model using the training sets 
optimal_modelRF.fit(X_train_features,y_train_features)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
# find the accuracy
y_pred = optimal_modelRF.predict(X_test_features)

# model accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test_features, y_pred)) #Accuracy: 0.9437691914022518

Accuracy: 0.9428096212896623


### SVM Model

In [13]:
model_SVM = SVC(kernel='linear')
model_SVM.fit(X_train_features, y_train_features)

y_pred = model_SVM.predict(X_test_features)

print(confusion_matrix(y_test_features,y_pred))
print(classification_report(y_test_features,y_pred))

[[ 669    3    0    8    0    0    2    0    0    0    0    0    0    0
     0    2    0   12    0    0    0]
 [   6  515    0    2    0    0    3    0    1    1    0    0    1    1
     0    0    0    3   14    2    0]
 [   1    0 1019    5    0    2    0    0    0    2    0    2    3   13
     4    0    0    1    0    0    6]
 [   0    0   15  849    0    3    0    2    0    0    0    0    1    4
    15    3    0    2    0    1    3]
 [   5    0    1    0  620    0    0    0    0    0    0    0    1    0
     0    0    0    0    1    0    0]
 [   1    0    6    6    0  917    4    1    0    0    0    3    7    1
     6    1    0    3    1    0    0]
 [   0    1    2    2    0   12  870    0    0    4    1   14    4    2
     2    1   22    0    4    0    0]
 [   6    0    5    3    0    0    3  473   10    1    1    1    7    0
     2    3    0    2    0    1    4]
 [   0    0    1    0    0    0    0   10  853    1    0    0    1    0
     0    0    0    0    1    0    0]
 [   2    

In [14]:
X, Y = list(df["text"]), list(df["language"])

# vectorize sentence X
vectorizer = CountVectorizer(analyzer='char')
X_features = count_vectorizer.fit_transform(X)

# vectorize label Y
label_encoder = preprocessing.LabelEncoder()
Y_features = label_encoder.fit_transform(Y)

In [15]:
models = [
    RandomForestClassifier(n_estimators=200),
    SVC(),
    GaussianNB(),
    LogisticRegression(),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models))) # no need
entries = []

In [None]:
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, 
                                 X_features.toarray(), 
                                 Y_features, 
                                 scoring='accuracy', 
                                 cv=CV, 
                                 n_jobs=-1                                
                                 )
    print(accuracies)    
    entries.append([model_name, sum(accuracies)/len(accuracies)])

cv_df = pd.DataFrame(entries, columns=['model_name', 'accuracy'])

cv_df