# Some basic explorations including length of texts

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from statsmodels.graphics.mosaicplot import mosaic

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import time

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the train and test CSVs and print the elapsed wall-clock time (seconds).
t1 = time.time()
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/contradictory-my-dear-watson/train.csv',
        '../input/contradictory-my-dear-watson/test.csv',
    )
)
t2 = time.time()
print(t2 - t1)

# Basic explorations

In [None]:
# Preview the first five rows of the training set.
df_train.head(n=5)

In [None]:
# Report (rows, columns) for both data sets.
for caption, frame in (
    ('Dim of training set: ', df_train),
    ('Dim of test:          ', df_test),
):
    print(caption, frame.shape)

In [None]:
# Bar chart of how often each label occurs in the training set.
label_counts = df_train['label'].value_counts()
ax = label_counts.plot.bar()
ax.set_title('Distribution of labels - Training')
ax.grid()
plt.show()

#### Nicely balanced labels!

In [None]:
# Interactive bar chart: number of training rows per language.
language_counts = df_train['language'].value_counts()
df_plot = pd.DataFrame({
    'language': language_counts.index.to_list(),
    'count': language_counts.values,
})
fig = px.bar(df_plot, x='language', y='count')
fig.update_layout(title='Languages - Training')
fig.show()

In [None]:
# Interactive bar chart: number of test rows per language.
language_counts = df_test['language'].value_counts()
df_plot = pd.DataFrame({
    'language': language_counts.index.to_list(),
    'count': language_counts.values,
})
fig = px.bar(df_plot, x='language', y='count')
fig.update_layout(title='Languages - Test')
fig.show()

In [None]:
# Show one example premise/hypothesis pair from the training set.
my_index = 20
example_row = df_train.loc[my_index]
print('Premise:    ', example_row['premise'])
print('Hypothesis: ', example_row['hypothesis'])

In [None]:
# Show another example pair (picked by the author as a non-English one).
my_index = 2
example_row = df_train.loc[my_index]
print('Premise:    ', example_row['premise'])
print('Hypothesis: ', example_row['hypothesis'])

# Focus on English texts

In [None]:
# Keep an independent copy of the training rows whose language is 'English'.
is_english = df_train['language'] == 'English'
df_train_en = df_train[is_english].copy()
df_train_en.shape

In [None]:
# Word cloud built from all English training premises joined into one string.
text = " ".join(df_train_en['premise'])
stopwords = set(STOPWORDS)

cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=500,
    width=600,
    height=400,
    background_color="white",
)
wordcloud = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")  # a word cloud has no meaningful axes
plt.show()

In [None]:
# Word cloud built from all English training hypotheses joined into one string.
text = " ".join(df_train_en['hypothesis'])
stopwords = set(STOPWORDS)

cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=500,
    width=600,
    height=400,
    background_color="white",
)
wordcloud = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")  # a word cloud has no meaningful axes
plt.show()

### Analyse length [characters] of texts (English only)

In [None]:
# Character length of every English premise and hypothesis.
# `.str.len()` is the vectorised pandas idiom: it returns an index-aligned
# Series and yields NaN for a missing text, whereas the previous
# list(map(len, ...)) would raise TypeError on the first non-string value.
df_train_en['premise_len'] = df_train_en['premise'].str.len()
df_train_en['hypothesis_len'] = df_train_en['hypothesis'].str.len()

In [None]:
# Summary statistics of the English premise lengths.
df_train_en['premise_len'].describe()

In [None]:
# Summary statistics of the English hypothesis lengths.
df_train_en['hypothesis_len'].describe()

In [None]:
# Histogram of premise lengths (characters) for the English training rows.
ax = df_train_en['premise_len'].plot.hist()
ax.set_title('Length of premise - Training (English)')
ax.grid()
plt.show()

In [None]:
# Histogram of hypothesis lengths (characters) for the English training rows.
ax = df_train_en['hypothesis_len'].plot.hist()
ax.set_title('Length of hypothesis - Training (English)')
ax.grid()
plt.show()

In [None]:
# Interactive scatter of premise length against hypothesis length,
# with each point coloured by its label.
premise_lengths = df_train_en['premise_len']
hypothesis_lengths = df_train_en['hypothesis_len']
point_labels = df_train_en['label']

fig = px.scatter(
    x=premise_lengths,
    y=hypothesis_lengths,
    color=point_labels,
    opacity=0.5,
)
fig.update_layout(
    title='Compare lengths - Training (English)',
    xaxis_title='Length of premise',
    yaxis_title='Length of hypothesis',
)
# Detach the markers from the shared colour axis to hide the colorbar.
fig.update_traces(marker_coloraxis=None)
fig.show()

In [None]:
# Smoothed (KDE) joint distribution of premise vs. hypothesis length using Seaborn.
# x / y are passed by keyword: recent seaborn releases no longer accept the
# data vectors positionally.
joint = sns.jointplot(
    x=df_train_en.premise_len,
    y=df_train_en.hypothesis_len,
    kind='kde',
)
# jointplot builds a multi-axes figure, so plt.title()/plt.grid() would act on
# whichever axes happens to be current (a marginal plot). Title the whole
# figure and put the grid on the central joint axes instead.
joint.fig.suptitle('Compare lengths - Training (English)')
joint.fig.subplots_adjust(top=0.93)  # leave room for the figure title
joint.ax_joint.grid()
plt.show()

# Check impact of features on target (label)

In [None]:
# Create length features on both the training and the test set:
#   premise_len / hypothesis_len - number of characters in each text
#   len_ratio                    - hypothesis_len divided by premise_len
#
# `.str.len()` is the vectorised pandas idiom: it returns an index-aligned
# Series and yields NaN for a missing text, whereas the previous
# list(map(len, ...)) would raise TypeError on the first non-string value.
for frame in (df_train, df_test):
    for text_column in ('premise', 'hypothesis'):
        frame[text_column + '_len'] = frame[text_column].str.len()
    frame['len_ratio'] = frame['hypothesis_len'] / frame['premise_len']

In [None]:
# Histogram of the hypothesis/premise length ratio in the training set.
ax = df_train['len_ratio'].plot.hist()
ax.grid()
plt.show()

In [None]:
# Histogram of the hypothesis/premise length ratio in the test set.
ax = df_test['len_ratio'].plot.hist()
ax.grid()
plt.show()

In [None]:
# Mosaic plot of language abbreviation against label, on a wide figure.
plt.rc('figure', figsize=(20, 6))
mosaic(df_train, index=['lang_abv', 'label'])
plt.show()

In [None]:
# Quantile-based cut into ten buckets, so each bucket holds roughly the same
# number of rows.
for length_column in ('premise_len', 'hypothesis_len'):
    df_train[length_column + '_binned'] = pd.qcut(df_train[length_column], q=10)

# Bar chart of the row count per premise-length bucket.
plt.rc('figure', figsize=(10, 6))
bucket_sizes = df_train['premise_len_binned'].value_counts()
ax = bucket_sizes.plot.bar()
ax.grid()
plt.show()

In [None]:
# Mosaic plot of binned premise length against label, on a wide figure.
plt.rc('figure', figsize=(20, 6))
mosaic(df_train, index=['premise_len_binned', 'label'])
plt.show()

In [None]:
# Mosaic plot of binned hypothesis length against label, on a wide figure.
plt.rc('figure', figsize=(20, 6))
mosaic(df_train, index=['hypothesis_len_binned', 'label'])
plt.show()

In [None]:
# Bin the length ratio into ten equal-sized buckets (quantile-based cut) and
# show the buckets against the label as a mosaic plot.
#
# The bins are stored in their own `len_ratio_binned` column, matching the
# `premise_len_binned` / `hypothesis_len_binned` naming, instead of overwriting
# the numeric `len_ratio`. This keeps the numeric feature available in
# df_train, just as it is in df_test.
df_train['len_ratio_binned'] = pd.qcut(df_train.len_ratio, q=10)

plt.rcParams['figure.figsize'] = (20, 6)
mosaic(df_train, ['len_ratio_binned', 'label'])
plt.show()

#### Ok, this looks pretty significant: for high length ratios (hypothesis length / premise length; right-most stacked bar), label 1 is much more likely than for very low ratios (left-most stacked bar).

# Make enhanced data available for download

In [None]:
# Write both enhanced frames to CSV files in the working directory.
for frame, out_name in ((df_train, 'df_train.csv'), (df_test, 'df_test.csv')):
    frame.to_csv(out_name)