# Creation of Language Training Profiles

## 1. Required Imports

In [8]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import pandas as pd

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashwinkumar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 2. Read Wili-2018 Dataset and filter it for 37 European Languages

In [4]:
data = pd.read_csv('/Users/ashwinkumar/Downloads/Language Detection/wili-2018/x_train.txt', sep='\r\n', header=None, engine='python')
res = pd.read_csv('/Users/ashwinkumar/Downloads/Language Detection/wili-2018/y_train.txt', sep='\r\n', header=None, engine='python')
langs = ['eng']
#'swe','nld','tur','lat','dan','por','fra','bos','bul','gle','be-tarask','spa','isl','cym','srp','ita','fin','cat','ron','ces','rus','lit','sqi','slk','ukr','hrv','eng','mlt','hye','nno','hun','deu','mwl','pol','ell','ltz']
data['1'] = res
data.columns = ['text', 'lang']
data = data[data['lang'].isin(langs)].reset_index(drop=True)
data.head()

Unnamed: 0,text,lang
0,In 1978 Johnson was awarded an American Instit...,eng
1,Bussy-Saint-Georges has built its identity on ...,eng
2,Minnesota's state parks are spread across the ...,eng
3,Nordahl Road is a station served by North Coun...,eng
4,A talk by Takis Fotopoulos about the Internati...,eng


## 3. Print filtered Lang Samples and Shape of Dataframe

In [5]:
print(data.head())
data.shape

                                                text lang
0  In 1978 Johnson was awarded an American Instit...  eng
1  Bussy-Saint-Georges has built its identity on ...  eng
2  Minnesota's state parks are spread across the ...  eng
3  Nordahl Road is a station served by North Coun...  eng
4  A talk by Takis Fotopoulos about the Internati...  eng


(500, 2)

## 4. Preprocess Lang Data(Remove punctuations and digits)

In [6]:
data['text'] = data['text'].str.replace(r'[^\w\s]+', '')
data['text'] = data['text'].str.replace(r'[\d]+', '')
data['text'] = data['text'].astype(str).str.lower()
data.shape

(500, 2)

## 4. Tokenize Lang Data to Prepare Lang Profiles

In [9]:
#data = data.groupby('lang')['text'].apply(' '.join).reset_index()
data['text'] = data.text.apply(word_tokenize)
data.shape

(500, 2)

## 5. Function to Create N-GRAMS

In [10]:
def word2ngrams(text, n=3):
  """ Convert word into character ngrams. """
  return [text[i:i+n] for i in range(len(text)-n+1)]

## 6. Generate Lang Profiles using Mono, Bi and Tri GRAMS

In [11]:
ngrams = []
data['ngrams'] = ''
for lang in data.text:
    lang_ngram = []
    for word in lang:
        lang_ngram = lang_ngram + word2ngrams(word,1) + word2ngrams(word,2) + word2ngrams(word, 3)
    lang_ngram = sorted(list(set(lang_ngram)), key=len)
    ngrams.append(lang_ngram)

for i in range(len(ngrams)):
    data.at[i, 'ngrams'] = ngrams[i]
data.shape

(500, 3)

In [12]:
data.head()

Unnamed: 0,text,lang,ngrams
0,"[in, johnson, was, awarded, an, american, inst...",eng,"[p, f, n, u, a, e, o, m, d, z, k, b, h, w, i, ..."
1,"[bussysaintgeorges, has, built, its, identity,...",eng,"[p, s, y, f, n, v, u, a, e, o, m, q, d, k, b, ..."
2,"[minnesotas, state, parks, are, spread, across...",eng,"[u, k, t, r, g, p, y, a, e, o, b, h, i, c, l, ..."
3,"[nordahl, road, is, a, station, served, by, no...",eng,"[p, y, f, n, v, u, a, e, o, m, d, k, b, h, w, ..."
4,"[a, talk, by, takis, fotopoulos, about, the, i...",eng,"[p, y, f, n, v, u, a, e, o, m, z, d, k, b, h, ..."


## 7. Write Generated Lang Profiles in JSON

In [13]:
data.to_json('/Users/ashwinkumar/Downloads/Language Detection/wili-2018/lang_profiles.json', orient='values')

# Detect Language using Test Language Profiles

## 1. Load Test Data

In [15]:
test_data = pd.read_csv('/Users/ashwinkumar/Downloads/Language Detection/wili-2018/x_test.txt', sep='\r\n', header=None, engine='python')
test_res = pd.read_csv('/Users/ashwinkumar/Downloads/Language Detection/wili-2018/y_test.txt', sep='\r\n', header=None, engine='python')
test_langs = ['eng']
    #'swe','nld','tur','lat','dan','por','fra','bos','bul','gle','be-tarask','spa','isl','cym','srp','ita','fin','cat','ron','ces','rus','lit','sqi','slk','ukr','hrv','eng','mlt','hye','nno','hun','deu','mwl','pol','ell','ltz']
test_data['1'] = test_res
test_data.columns = ['text', 'lang']
test_data = test_data[test_data['lang'].isin(langs)].reset_index(drop=True)
test_data.head()

Unnamed: 0,text,lang
0,Anton (or Antonius) Maria Schyrleus (also Schy...,eng
1,Zvi Zamir (Hebrew: צבי זמיר‎) born Zvicka Zarz...,eng
2,Texas A&M hired Kennedy in May 2011 after Mark...,eng
3,"Philip Johnson architectural drawings, 1943-19...",eng
4,"AVCO Embassy Pictures, the film's financial ba...",eng


## 2. Print filtered Lang Test Samples and Shape of Dataframe

In [16]:
print(test_data.head())
test_data.shape

                                                text lang
0  Anton (or Antonius) Maria Schyrleus (also Schy...  eng
1  Zvi Zamir (Hebrew: צבי זמיר‎) born Zvicka Zarz...  eng
2  Texas A&M hired Kennedy in May 2011 after Mark...  eng
3  Philip Johnson architectural drawings, 1943-19...  eng
4  AVCO Embassy Pictures, the film's financial ba...  eng


(500, 2)

## 3. Preprocess Test Lang Data(Remove punctuations and digits)

In [17]:
test_data['text'] = test_data['text'].str.replace(r'[^\w\s]+', '')
test_data['text'] = test_data['text'].str.replace(r'[\d]+', '')
test_data['text'] = test_data['text'].astype(str).str.lower()
test_data.head()

Unnamed: 0,text,lang
0,anton or antonius maria schyrleus also schyrl ...,eng
1,zvi zamir hebrew צבי זמיר born zvicka zarzevsk...,eng
2,texas am hired kennedy in may after mark turg...,eng
3,philip johnson architectural drawings bulk he...,eng
4,avco embassy pictures the films financial back...,eng


## 4. Tokenize Test Lang Data to Prepare Lang Profiles

In [123]:
test_data['text'] = test_data.text.apply(word_tokenize)
test_data.head()

Unnamed: 0,text,lang
0,"[schiedam, is, gelegen, tussen, rotterdam, en,...",nld
1,"[savannklimat, råder, i, trakten, årsmedeltemp...",swe
2,"[kaye, friade, till, sin, flickvän, pianisten,...",swe
3,"[pantopipetta, brevipilata, är, en, havsspinde...",swe
4,"[moffatia, plumicauda, is, een, vlinder, uit, ...",nld


## 5. Generate Test Lang Profiles using Mono, Bi and Tri GRAMS

In [124]:
ngrams = []
test_data['ngrams'] = ''
for lang in test_data.text:
    lang_ngram = []
    for word in lang:
        lang_ngram = lang_ngram + word2ngrams(word,1) + word2ngrams(word,2) + word2ngrams(word, 3)
    lang_ngram = sorted(list(set(lang_ngram)), key=len)
    ngrams.append(lang_ngram)
#print(ngrams)
for i in range(len(ngrams)):
    test_data.at[i, 'ngrams'] = ngrams[i]
test_data.head()

Unnamed: 0,text,lang,ngrams
0,"[schiedam, is, gelegen, tussen, rotterdam, en,...",nld,"[b, j, e, t, m, v, s, r, l, p, u, g, d, h, w, ..."
1,"[savannklimat, råder, i, trakten, årsmedeltemp...",swe,"[b, å, j, e, t, m, v, s, r, l, p, ä, u, g, d, ..."
2,"[kaye, friade, till, sin, flickvän, pianisten,...",swe,"[å, r, l, p, g, ö, z, e, v, s, u, w, k, m, n, ..."
3,"[pantopipetta, brevipilata, är, en, havsspinde...",swe,"[b, å, j, e, t, m, v, s, r, p, l, ä, u, g, d, ..."
4,"[moffatia, plumicauda, is, een, vlinder, uit, ...",nld,"[b, y, j, e, t, m, v, s, r, p, l, u, g, d, h, ..."


In [125]:
test_data.shape

(1000, 3)

In [126]:
print(data['ngrams'][0])
print(test_data['ngrams'][0])

['b', 'y', 'j', 'å', 'e', 't', 'm', 's', 'r', 'p', 'l', 'u', 'g', 'd', 'h', 'i', 'o', 'c', 'n', 'f', 'a', 'k', 'su', 'is', 're', 'ri', 'ar', 'in', 'nd', 'ec', 'ra', 'eb', 'of', 'as', 'sk', 'it', 'ot', 'bi', 'me', 'hi', 'at', 'di', 'er', 'he', 'ns', 'br', 'ir', 'ph', 'ru', 'ma', 'på', 'bl', 'ro', 'ea', 'si', 'rc', 'li', 'ui', 'tr', 'ne', 'ia', 'ho', 'je', 'ty', 'us', 'no', 'ep', 'ss', 'io', 'or', 'om', 'se', 'ca', 'st', 'ng', 'en', 'es', 'be', 'ci', 'an', 'ib', 'ti', 'os', 'th', 'jo', 'ut', 'tu', 'pe', 'ts', 'ch', 'ei', 'to', 'ic', 'ry', 'itu', 'sui', 'ira', 'bes', 'jes', 'seb', 'tre', 'rea', 'erc', 'ere', 'ebe', 'sti', 'ssi', 'eir', 'ici', 'dia', 'jos', 'esu', 'per', 'rch', 'oth', 'nor', 'rei', 'ric', 'lio', 'mas', 'ose', 'tit', 'ian', 'bri', 'ist', 'ris', 'ino', 'tho', 'his', 'eat', 'bli', 'sin', 'oma', 'ary', 'ner', 'uti', 'ibr', 'tut', 'rom', 'ori', 'the', 'ins', 'oru', 'nst', 'iar', 'hin', 'bib', 'hom', 'nsk', 'eph', 'sto', 'iot', 'its', 'sep', 'lib', 'uss', 'sia', 'hec', 'rus', 't

In [127]:
data.shape

(1000, 3)

In [None]:
UB = 0.95
LB = 0.92

for i in range(len(test_data)):
    for j in range(len(data)):
        data.at[j, 'match_rate_lang_'+str(i)] = len(set(test_data.iloc[i]['ngrams']).intersection(data.iloc[j]['ngrams']))/len(data.iloc[j]['ngrams'])
data.head()

In [108]:
print('Language\tScore')
for i in range(len(test_data)):
    print(data.loc[data['match_rate_lang_'+str(i)].idxmax()]['lang']+'\t'+str(data.loc[data['match_rate_lang_'+str(i)].idxmax()]['match_rate_lang_'+str(i)]))

Language	Score
nld	0.5915492957746479
nld	0.7853107344632768
nld	0.42613636363636365
nld	0.7810650887573964
nld	0.42962962962962964
nld	0.3898305084745763
nld	0.6637931034482759
nld	0.7928994082840237
nld	0.5638297872340425
nld	0.6513761467889908
nld	0.6190476190476191
nld	0.562962962962963
nld	0.6206896551724138
nld	0.5460526315789473
nld	0.4619289340101523
nld	0.7276595744680852
nld	0.43783783783783786
nld	0.6699029126213593
nld	0.6810344827586207
nld	0.6551724137931034
nld	0.7302631578947368
nld	0.5562913907284768
nld	0.6
nld	0.5105263157894737
nld	0.36507936507936506
nld	0.6510067114093959
nld	0.6260162601626016
nld	0.861878453038674
nld	0.7073170731707317
nld	0.6864864864864865
nld	0.5
nld	0.8496732026143791
nld	0.7816901408450704
nld	0.7751479289940828
nld	0.4966887417218543
nld	0.528169014084507
nld	0.5603448275862069
nld	0.603448275862069
nld	0.6454545454545455
nld	0.8165680473372781
nld	0.5789473684210527
nld	0.6180555555555556
nld	0.6796536796536796
nld	0.8186813186813187
nld

nld	0.7005649717514124
nld	0.4014084507042254
nld	0.6120689655172413
nld	0.5408805031446541
nld	0.5673076923076923
nld	0.5862068965517241
nld	0.8157894736842105
nld	0.5563380281690141
nld	0.6293103448275862
nld	0.6973684210526315
nld	0.47413793103448276
nld	0.6810344827586207
nld	0.6379310344827587
nld	0.4482758620689655
nld	0.6369426751592356
nld	0.5704697986577181
nld	0.3888888888888889
nld	0.6551724137931034
nld	0.4972972972972973
nld	0.6255924170616114
nld	0.6724137931034483
nld	0.5416666666666666
nld	0.5517241379310345
nld	0.776536312849162
nld	0.6666666666666666
nld	0.5
nld	0.42105263157894735
nld	0.5231788079470199
nld	0.6529411764705882
nld	0.5471698113207547
nld	0.556390977443609
nld	0.5
nld	0.5939849624060151
nld	0.7758620689655172
nld	0.5310344827586206
nld	0.6691729323308271
nld	0.7966101694915254
nld	0.8229166666666666
nld	0.5481481481481482
nld	0.5609756097560976
nld	0.5714285714285714
nld	0.8315789473684211
nld	0.6702702702702703
nld	0.6338028169014085
nld	0.739436619718

In [109]:
results = []
for i in range(len(test_data)):
    results.append({'Language':data.loc[data['match_rate_lang_'+str(i)].idxmax()]['lang'], 'Score':data.loc[data['match_rate_lang_'+str(i)].idxmax()]['match_rate_lang_'+str(i)]})
res_df = pd.DataFrame(results)
res_df.head()

Unnamed: 0,Language,Score
0,nld,0.591549
1,nld,0.785311
2,nld,0.426136
3,nld,0.781065
4,nld,0.42963


In [110]:
res_df['Score'].mean()

0.628117824575719