In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import pymorphy2
import nltk
import re
# Fetch the NLTK resources this notebook relies on (skipped by NLTK when
# they are already present and up to date).
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing  import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Labelled training data: one sentence per row plus its language code.
TRAIN_CSV = 'train.csv'
train_df = pd.read_csv(TRAIN_CSV)

# Preview the first five rows.
train_df.head(n=5)

Unnamed: 0,sentence,language
0,"Pensez à la communication , le discours , les ...",fr
1,"Můžete si ji pronajmout , vzít na splátky , ko...",cs
2,"Každý starosta pochopil , že když mají tyto fo...",cs
3,"Det är ytterligare bevis , men ändå — Jag krit...",sv
4,كان الأمر لا يصدق .,ar


In [3]:
# Unlabelled sentences to classify; `index` identifies each row in the submission.
TEST_CSV = 'test.csv'
test_df = pd.read_csv(TEST_CSV)

# Preview the first five rows.
test_df.head(n=5)

Unnamed: 0,index,sentence
0,0,תודה לכם .
1,1,"Precisamos de compaixão para começar , e auto-..."
2,2,這個增長相當大 ， 並且它將引發經濟的增長 。
3,3,시애틀에서 자란 제가 처음 가난을 보게 되던 때를 기억해요 .
4,4,これをロボットに組み込みました


In [4]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train_df, test_df))

((3159633, 2), (2784634, 2))

In [5]:
# Number of training sentences per language, most frequent first
# (value_counts sorts in descending order by default).
lang_count = (
    train_df['language']
    .value_counts()
    .to_frame()
    .reset_index()
)
lang_count.head()

Unnamed: 0,index,language
0,ru,181970
1,en,157655
2,fr,148585
3,vi,113193
4,ar,110032


In [6]:
# Encode the language codes as integer class ids.
# The label column is selected by name: `train_df.values[:, 1]` would
# materialise the whole frame as a 2-D object array (twice) and silently
# pick the wrong column if the CSV column order ever changed.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['language'])

In [7]:
# Case-fold every sentence so that character n-grams do not depend on capitalisation.
for _frame in (train_df, test_df):
    _frame['sentence'] = _frame['sentence'].str.lower()

In [8]:
# Runs of characters that are neither word characters nor whitespace
# (punctuation, symbols), or runs of digits. `\w` is Unicode-aware for
# Python 3 `str` patterns, so letters of every script are preserved;
# digits need their own alternative because `\w` also matches them.
_NOISE_PATTERN = r'[^\w\s]+|[\d]+'


def _clean_sentences(sentences):
    """Remove punctuation/symbol runs and digits, then trim outer whitespace.

    Parameters
    ----------
    sentences : pandas.Series
        Series of sentence strings.

    Returns
    -------
    pandas.Series
        A new Series with the same index. Missing values stay missing
        instead of raising ``TypeError`` as a per-row ``re.sub`` would.
    """
    # Vectorised string methods replace the per-row lambda; the former
    # `"".join(...)` around the result was a no-op on an already-built str.
    return sentences.str.replace(_NOISE_PATTERN, '', regex=True).str.strip()


test_df['sentence'] = _clean_sentences(test_df['sentence'])
train_df['sentence'] = _clean_sentences(train_df['sentence'])
train_df['sentence'].head(10)

0    pensez à la communication  le discours  les ge...
1    můžete si ji pronajmout  vzít na splátky  koup...
2    každý starosta pochopil  že když mají tyto for...
3    det är ytterligare bevis  men ändå  jag kritis...
4                                    كان الأمر لا يصدق
5                 na primjer  pjesnik ga opisuje ovako
6    semua rerumputan itu sekarang menutupi tanah s...
7        det är en enorm utmaning för oss att göra det
8    ono što ćete vidjeti  trebat ćete skočiti ovdj...
9                  alqışlar  exceldən istifadə etmişəm
Name: sentence, dtype: object

In [9]:
# No fileids filter: collect the stop words of every language shipped
# with the NLTK stopwords corpus into a single list.
mystopwords = stopwords.words(fileids=None)

In [10]:
%%time
    
vectorizer = TfidfVectorizer(analyzer = 'char', stop_words = mystopwords, max_df = 0.8, min_df = 4, ngram_range = (3, 4)) 
text_vect_train = vectorizer.fit_transform(train_df.sentence.values)
text_vect_test = vectorizer.transform(test_df.sentence.values)

CPU times: user 17min 21s, sys: 57.8 s, total: 18min 19s
Wall time: 18min 36s


In [11]:
%%time

model = SGDClassifier(class_weight='balanced', random_state=123)
model.fit(text_vect_train, y)

CPU times: user 19min 31s, sys: 19.9 s, total: 19min 51s
Wall time: 19min 51s


SGDClassifier(class_weight='balanced', random_state=123)

In [12]:
# Predicted class ids for the training sentences.
# NOTE(review): despite the LR_ prefix, `model` is the SGDClassifier built
# without a `loss` argument (scikit-learn's documented default is 'hinge',
# i.e. a linear SVM rather than logistic regression).
LR_predict_train = model.predict(X=text_vect_train)

In [13]:
# Balanced accuracy (mean per-class recall) on the training data itself.
# This is an in-sample score: the model was fitted on these same rows, so
# it is an optimistic estimate of performance on unseen sentences.
balanced_accuracy_score(y_true=y, y_pred=LR_predict_train)

0.8996041494041876

In [14]:
# Predicted class ids (label-encoder integers) for the unlabelled test sentences.
LR_predict_test = model.predict(X=text_vect_test)

In [15]:
# Map the predicted integer class ids back to their language codes.
test_df['language'] = label_encoder.inverse_transform(LR_predict_test)
test_df.head()

Unnamed: 0,index,sentence,language
0,0,תודה לכם,he
1,1,precisamos de compaixão para começar e autoin...,pt-br
2,2,這個增長相當大 並且它將引發經濟的增長,zh-tw
3,3,시애틀에서 자란 제가 처음 가난을 보게 되던 때를 기억해요,ko
4,4,これをロボットに組み込みました,ja


In [16]:
# Write the submission file: only the row id and the predicted language,
# without the DataFrame's own positional index.
test_df.to_csv('out.csv', columns=['index', 'language'], index=False)

In [17]:
!head out.csv

index,language
0,he
1,pt-br
2,zh-tw
3,ko
4,ja
5,zh-tw
6,hu
7,nl
8,ru
