In [None]:
import numpy as np
import pandas as pd
# plots
import matplotlib.pyplot as plt
import seaborn as sns
# Allow up to 100 characters per cell when DataFrames are displayed, so more
# of each text value is visible before pandas truncates it.
pd.set_option('display.max_colwidth', 100)

In [None]:
# Load the train, test and sample-submission CSV files from the chaii input
# directory; the three frames are unpacked in the same order as the paths.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/chaii-hindi-and-tamil-question-answering/train.csv',
        '../input/chaii-hindi-and-tamil-question-answering/test.csv',
        '../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv',
    )
)

In [None]:
def add_length_features(df, column, suffix):
    """Add character/word length statistics of one text column to ``df`` in place.

    Three columns are written:

    * ``n_char_<suffix>``        -- number of characters in the text
    * ``n_word_<suffix>``        -- number of whitespace-separated tokens
    * ``char_per_word_<suffix>`` -- ``n_char_<suffix> / n_word_<suffix>``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``; it is modified in place.
    column : str
        Name of the text column to measure.
    suffix : str
        Suffix appended to the three generated column names.

    Returns
    -------
    list of str
        The names of the three columns that were written, in creation order.
    """
    text = df[column]
    n_char = 'n_char_' + suffix
    n_word = 'n_word_' + suffix
    char_per_word = 'char_per_word_' + suffix

    df[n_char] = text.str.len()
    # .str.len() on the token lists replaces map(lambda x: len(x)); it gives
    # the same counts but yields NaN for a missing text instead of raising
    # TypeError from len().
    df[n_word] = text.str.split().str.len()
    df[char_per_word] = df[n_char] / df[n_word]
    return [n_char, n_word, char_per_word]


# (feature-name suffix, source column in df_train); the answer text lives in
# the 'answer_text' column but its features use the shorter 'answer' suffix.
text_fields = (('context', 'context'),
               ('question', 'question'),
               ('answer', 'answer_text'))

# Names of all derived columns, in the order they are added to df_train.
new_features = []
for suffix, column in text_fields:
    new_features += add_length_features(df_train, column, suffix)

In [None]:
# Summary statistics of the derived length features.
df_train.loc[:, new_features].describe()

In [None]:
# One histogram per derived feature, each drawn on its own figure.
for feature in new_features:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.hist(df_train[feature])
    ax.set_title('Histogram of ' + feature)
    ax.grid()
    plt.show()

In [None]:
# Bar chart of how often each value of the 'language' column occurs.
fig, ax = plt.subplots(figsize=(10, 5))
df_train['language'].value_counts().plot.bar(ax=ax)
ax.set_title('Language')
ax.grid()
plt.show()

In [None]:
# Question word count (y) plotted against context word count (x).
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df_train['n_word_context'], df_train['n_word_question'], alpha=0.25)
ax.set(title='Word count - Questions vs Context',
       xlabel='# Words Context',
       ylabel='# Words Question')
ax.grid()
plt.show()

In [None]:
# Question character count (y) plotted against context character count (x).
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df_train['n_char_context'], df_train['n_char_question'], alpha=0.25)
ax.set(title='Character count - Questions vs Context',
       xlabel='# Chars Context',
       ylabel='# Chars Question')
ax.grid()
plt.show()

In [None]:
# Characters-per-word of the question (y) against that of the context (x).
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df_train['char_per_word_context'],
           df_train['char_per_word_question'],
           alpha=0.25)
ax.set(title='Characters per Word - Question vs Context',
       xlabel='Chars/Word Context',
       ylabel='Chars/Word Question')
ax.grid()
plt.show()

In [None]:
# Answer word count (y) plotted against question word count (x).
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df_train['n_word_question'], df_train['n_word_answer'], alpha=0.25)
ax.set(title='Word count - Answer vs Question',
       xlabel='# Words Question',
       ylabel='# Words Answer')
ax.grid()
plt.show()

In [None]:
# Answer character count (y) plotted against question character count (x).
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df_train['n_char_question'], df_train['n_char_answer'], alpha=0.25)
ax.set(title='Character count - Answer vs Question',
       xlabel='# Chars Question',
       ylabel='# Chars Answer')
ax.grid()
plt.show()

In [None]:
# Characters-per-word of the answer (y) against that of the question (x).
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(df_train['char_per_word_question'],
           df_train['char_per_word_answer'],
           alpha=0.25)
ax.set(title='Characters per Word - Answer vs Question',
       xlabel='Chars/Word Question',
       ylabel='Chars/Word Answer')
ax.grid()
plt.show()

In [None]:
# Rows whose answer contains more words than its question
# (note the comparison direction: n_word_answer > n_word_question).
answer_has_more_words = df_train['n_word_answer'].gt(df_train['n_word_question'])
df_Q_longer_A = df_train.loc[answer_has_more_words]
df_Q_longer_A

In [None]:
# Pairwise scatter matrix of all derived features, coloured by language;
# points are drawn semi-transparent (alpha 0.25).
pairplot_columns = [*new_features, 'language']
sns.pairplot(
    df_train[pairplot_columns],
    hue='language',
    plot_kws=dict(alpha=0.25),
)
plt.show()

In [None]:
# Distribution of characters-per-word by language, one violin plot each for
# the context, the question and the answer.
violin_specs = (
    ('char_per_word_context', 'Context'),
    ('char_per_word_question', 'Question'),
    ('char_per_word_answer', 'Answer'),
)
for ratio_column, heading in violin_specs:
    ax = sns.violinplot(data=df_train, x='language', y=ratio_column)
    ax.grid()
    ax.set_title(heading)
    plt.show()