# Graphs

#### Import Libraries

In [2]:
%matplotlib inline

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import svm
import xgboost as xgb
import pandas as pd
import numpy as np
import os

### Read the dataset

In [40]:
# Load the merged dataset and replace every missing value with 0 in one step.
df = pd.read_csv("merged_comp_data.csv").fillna(0)

### Information about dataset

In [41]:
# Distinct values found in the 'lang' and 'uniform_id' columns
languages = df['lang'].unique()
participants = df['uniform_id'].unique()

# Summary, one line each: number of languages, the language values themselves,
# and number of participants
print(
    f'There are {len(languages)} languages in dataset',
    f'The languages in dataset are: {languages}',
    f'There are {len(participants)} participants in dataset',
    sep='\n',
)


There are 12 languages in dataset
The languages in dataset are: ['du' 'gr' 'he' 'it' 'no' 'ee' 'fi' 'ge' 'en' 'ru' 'sp' 'tr']
There are 450 participants in dataset


### Categorize the languages

In [42]:
# Build one filtered frame per language code. The generator yields the frames in
# the order of the code tuple, and they are unpacked into the matching names.
(du_lang, gr_lang, he_lang, it_lang, no_lang, ee_lang,
 fi_lang, ge_lang, en_lang, ru_lang, sp_lang, tr_lang) = (
    df[df['lang'] == code]
    for code in ('du', 'gr', 'he', 'it', 'no', 'ee',
                 'fi', 'ge', 'en', 'ru', 'sp', 'tr')
)

# Collect the per-language frames, in the same order, for iteration
lang_list = [
    du_lang, gr_lang, he_lang, it_lang, no_lang, ee_lang,
    fi_lang, ge_lang, en_lang, ru_lang, sp_lang, tr_lang,
]

# Report how many per-language frames were built
print(f'Total language dataframes: {len(lang_list)}')


Total language dataframes: 12


### Categorize participants per language

In [43]:
# Report the number of distinct participants in each per-language frame.
for lang_frame in lang_list:
    # unique participant ids present in this frame
    ids_in_frame = lang_frame['uniform_id'].unique()

    # unique language values in this frame (printed as an array)
    langs_in_frame = lang_frame['lang'].unique()

    print(f'There are {len(ids_in_frame)} participants in {langs_in_frame} dataset')

There are 44 participants in ['du'] dataset
There are 24 participants in ['gr'] dataset
There are 40 participants in ['he'] dataset
There are 44 participants in ['it'] dataset
There are 28 participants in ['no'] dataset
There are 54 participants in ['ee'] dataset
There are 47 participants in ['fi'] dataset
There are 39 participants in ['ge'] dataset
There are 40 participants in ['en'] dataset
There are 41 participants in ['ru'] dataset
There are 34 participants in ['sp'] dataset
There are 15 participants in ['tr'] dataset


### Categorize PoS per language

In [44]:
# Report the number of distinct PoS values in each per-language frame.
for lang_frame in lang_list:
    # unique PoS values present in this frame
    pos_values = lang_frame['PoS'].unique()

    # unique language values in this frame (printed as an array)
    langs_in_frame = lang_frame['lang'].unique()

    print(f'There are {len(pos_values)} PoS in {langs_in_frame} dataset')


There are 31 PoS in ['du'] dataset
There are 31 PoS in ['gr'] dataset
There are 31 PoS in ['he'] dataset
There are 31 PoS in ['it'] dataset
There are 31 PoS in ['no'] dataset
There are 31 PoS in ['ee'] dataset
There are 31 PoS in ['fi'] dataset
There are 31 PoS in ['ge'] dataset
There are 31 PoS in ['en'] dataset
There are 31 PoS in ['ru'] dataset
There are 31 PoS in ['sp'] dataset
There are 31 PoS in ['tr'] dataset


### Create PoS data for each language

In [95]:
# Maps each language code to a dict of {PoS value: mean 'dur'}, with the PoS
# keys in sorted order.
language_pos_data = {}
for lang_frame in lang_list:
    # The first unique 'lang' value of the frame is used as its language code
    lang_code = lang_frame['lang'].unique()[0]

    # Mean of the 'dur' column over the rows of each PoS value; iterating the
    # sorted PoS values makes the resulting dict key-ordered.
    language_pos_data[lang_code] = {
        pos_tag: lang_frame.loc[lang_frame['PoS'] == pos_tag, 'dur'].mean()
        for pos_tag in sorted(lang_frame['PoS'].unique())
    }

    


### Test: mean duration per language for each PoS

In [None]:
# Build the PoS-major view of the durations:
#   pos_data_per_language[pos][language_code] -> mean 'dur' of that PoS in that language
pos_data_per_language = {}

# PoS values to iterate over, taken from the keys stored for the 'du' language
pos_count_var = list(language_pos_data['du'].keys())

for pos in pos_count_var:
    language_data = {}
    for language in lang_list:
        # Key by the language code, not by the frame: a DataFrame is unhashable,
        # so using it as a dict key raises TypeError.
        language_code = language['lang'].unique()[0]

        # Mean of the 'dur' column over the rows of this PoS in this language
        pos_duration = language.loc[language['PoS'] == pos, 'dur'].mean()
        language_data[language_code] = pos_duration

    # Store the per-language means for this PoS; previously they were discarded
    # on every iteration and pos_data_per_language stayed empty.
    pos_data_per_language[pos] = language_data



# Graphs

### PoS data per language

In [97]:
# NOTE: this plotting cell is disabled -- every statement below is commented out.
# When enabled, it loops over language_pos_data and draws one bar chart per
# language: the PoS keys on the x-axis and their stored mean durations as bar
# heights, with a y-label, a per-language title and a 14x6 inch figure size.
# NOTE(review): the names `fruits`/`counts` look carried over from a matplotlib
# example; here they hold the PoS keys and the mean durations -- consider renaming.
# for language in language_pos_data:
#     fig, ax = plt.subplots()
#     fruits = list(language_pos_data[language].keys())
#     counts = list(language_pos_data[language].values())
#     bar_labels = list(language_pos_data[language].keys())
#     ax.bar(fruits, counts, label=bar_labels)
#     ax.set_ylabel('duration')
#     ax.set_title(f'Mean of durations per PoS value in {language} language')
#     fig.autofmt_xdate()
#     fig.set_size_inches(14,6)
#     plt.show()

### Language data per PoS