# Imports

In [None]:
import sys
sys.path.append('../input/rich-text-formatting')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random

import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
from plotly.offline import iplot
from wordcloud import WordCloud
from plotly.offline import iplot


from spacy.lang.hi import Hindi
from spacy.lang.ta import Tamil
from spacy.lang.hi import STOP_WORDS as hindi_stopwords
from spacy.lang.ta import STOP_WORDS as tamil_stopwords
from collections import Counter

In [None]:
# Read the competition's train, test and sample-submission CSVs (in that
# order) and bind each resulting DataFrame to its own name.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/chaii-hindi-and-tamil-question-answering/train.csv",
        "/kaggle/input/chaii-hindi-and-tamil-question-answering/test.csv",
        "/kaggle/input/chaii-hindi-and-tamil-question-answering/sample_submission.csv",
    )
)

# Let's Explore Train Data 

In [None]:
# Preview the first rows of the training table (pandas `head()` with its default row count).
train_df.head()

In [None]:
# Bar chart of how many training rows belong to each language.
language = train_df["language"].value_counts()

# Turn the counts Series into a two-column frame: language / frequency.
language_df = language.rename_axis("language").reset_index(name="frequency")

fig = px.bar(
    data_frame=language_df,
    x="language",
    y="frequency",
    color="language",
    title="Language Distribution",
    width=1000,
    height=500,
)
fig.show()

In [None]:
# Pie chart of the share of training rows per language.
language = train_df["language"].value_counts()
language_df = pd.DataFrame({"language":language.index,"frequency":language.values})
hover_values = language.index

# `names` is the Plotly Express argument that labels each sector. The
# original passed the language column as `labels`, which is only a dict for
# renaming columns in the figure, so the sectors were left without names.
fig = px.pie(
    values=language_df["frequency"],
    names=language_df["language"],
    title="Language Distribution",
    hover_name=hover_values,
)
fig.show()

## From this we can clearly see that there is more Hindi text than Tamil text

## Analysis on Context data

In [None]:
# Character length of every context passage, split by language.
hindi_context = train_df.loc[train_df['language'] == 'hindi', 'context'].str.len()
tamil_context = train_df.loc[train_df['language'] == 'tamil', 'context'].str.len()

# Side-by-side histograms: Hindi in the left panel, Tamil in the right one.
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=list(hindi_context), name='Hindi Context'), row=1, col=1)
fig.add_trace(go.Histogram(x=list(tamil_context), name='Tamil Context'), row=1, col=2)

fig.update_layout(title_text="Character Count by Language", width=800, height=400)
iplot(fig)

In [None]:
# Whitespace-separated word count of every context passage, split by language.
hindi = train_df.loc[train_df['language'] == 'hindi', 'context'].str.split().map(len)
tamil = train_df.loc[train_df['language'] == 'tamil', 'context'].str.split().map(len)

# Side-by-side histograms: Hindi in the left panel, Tamil in the right one.
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Histogram(x=list(hindi), name='Hindi Context'), row=1, col=1)
fig.add_trace(go.Histogram(x=list(tamil), name='Tamil Context'), row=1, col=2)

fig.update_layout(title_text="Word Count Distribution by Language", width=800, height=400)
iplot(fig)

## Analysis on Question data 

In [None]:
# Character length of every question, split by language.
hindi_question = train_df[train_df['language']=='hindi']['question'].str.len()
tamil_question = train_df[train_df['language']=='tamil']['question'].str.len()
fig = make_subplots(rows=1, cols=2)

# Plot the question lengths computed above. The original plotted `hindi` /
# `tamil`, i.e. the context word counts left over from an earlier cell.
fig.add_trace(
    go.Histogram(x=list(hindi_question), name='Hindi Question'),
    row=1,
    col=1
)

fig.add_trace(
    go.Histogram(x=list(tamil_question), name='Tamil Question'),
    row=1,
    col=2,
)

fig.update_layout(height=400, width=800, title_text="Character Count by Language")
iplot(fig)

In [None]:
# Whitespace-separated word count of every question, split by language.
hindi_question = train_df[train_df['language']=='hindi']['question'].str.split().map(len)
tamil_question = train_df[train_df['language']=='tamil']['question'].str.split().map(len)
fig = make_subplots(rows=1, cols=2)

# Plot the question word counts computed above. The original plotted
# `hindi` / `tamil`, i.e. the context word counts from an earlier cell.
fig.add_trace(
    go.Histogram(x=list(hindi_question), name='Hindi Question'),
    row=1,
    col=1
)

fig.add_trace(
    go.Histogram(x=list(tamil_question), name='Tamil Question'),
    row=1,
    col=2,
)

# The data are word counts, so the title says so (it previously read
# "Character Count by Language").
fig.update_layout(height=400, width=800, title_text="Word Count Distribution by Language")
iplot(fig)

In [None]:
def generate_word_cloud(font_path, text):
    """Draw a word cloud with matplotlib and show it.

    Args:
        font_path: Font file handed to ``WordCloud`` for rendering the words.
        text: Despite the name, this is forwarded to
            ``WordCloud.generate_from_frequencies``, so it should be a
            word -> frequency mapping (the notebook passes ``Counter`` objects).
    """
    cloud_builder = WordCloud(
        font_path=font_path,
        background_color='black',
        width=3000,
        height=2000,
    )
    rendered_cloud = cloud_builder.generate_from_frequencies(text)

    # Large black canvas with no axes or padding, so only the cloud is visible.
    plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')
    plt.imshow(rendered_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Download the Devanagari and Tamil font archives (quiet mode) into the
# working directory. NOTE(review): fetched over plain HTTP from a third-party
# site and requires internet access in the notebook.
!wget -q http://www.lipikaar.com/sites/www.lipikaar.com/themes/million/images/support/fonts/Devanagari.zip
!wget -q http://www.lipikaar.com/sites/www.lipikaar.com/themes/million/images/support/fonts/Tamil.zip

# Extract both archives quietly. The word-cloud cells later read
# "Devanagari/Lohit-Devanagari.ttf" and "Tamil/Lohit-Tamil.ttf", presumably
# produced by these extractions - verify the archive layout.
!unzip -qq Devanagari.zip
!unzip -qq Tamil.zip

In [None]:
# Concatenate every question of each language into one space-separated string.
tamil_text_question = " ".join(train_df.loc[train_df["language"] == "tamil", "question"])
hindi_text_question = " ".join(train_df.loc[train_df["language"] == "hindi", "question"])

In [None]:
# Get the tokens and frequencies for Hindi language
hindi_nlp = Hindi()
# spaCy rejects texts longer than `max_length`; keep the original limit but
# never let it fall below the length of the text we are about to tokenize.
hindi_nlp.max_length = max(1030000, len(hindi_text_question))
hindi_doc = hindi_nlp(hindi_text_question)
# Count every occurrence so the word cloud reflects real frequencies. The
# original built the Counter from a set, which gave each token a count of 1.
# Whitespace and punctuation tokens are skipped because they are not words
# and would otherwise dominate the cloud.
hindi_tokens_counter = Counter(
    token.text for token in hindi_doc if not (token.is_space or token.is_punct)
)
hindi_tokens = set(hindi_tokens_counter)  # unique tokens, kept as a set as before


# Get the tokens and frequencies for Tamil language
tamil_nlp = Tamil()
tamil_nlp.max_length = max(1030000, len(tamil_text_question))
# Use the Tamil pipeline here; the original ran the Tamil text through
# `hindi_nlp`, leaving `tamil_nlp` unused.
tamil_doc = tamil_nlp(tamil_text_question)
tamil_tokens_counter = Counter(
    token.text for token in tamil_doc if not (token.is_space or token.is_punct)
)
tamil_tokens = set(tamil_tokens_counter)  # unique tokens, kept as a set as before

# WordCloud for Question

## Hindi

In [None]:
# Draw the Hindi word cloud from `hindi_tokens_counter` using the Lohit Devanagari font file.
generate_word_cloud(font_path="Devanagari/Lohit-Devanagari.ttf",text=hindi_tokens_counter)

## Tamil

In [None]:
# Draw the Tamil word cloud from `tamil_tokens_counter` using the Lohit Tamil font file.
generate_word_cloud(font_path="Tamil/Lohit-Tamil.ttf",text=tamil_tokens_counter)

# WordCloud for Context

In [None]:
# Concatenate every context passage of each language into one space-separated string.
tamil_text_context = " ".join(train_df.loc[train_df["language"] == "tamil", "context"])
hindi_text_context = " ".join(train_df.loc[train_df["language"] == "hindi", "context"])

In [None]:
# Get the tokens and frequencies for Hindi language
hindi_nlp = Hindi()
# The joined contexts are much longer than the questions, so a larger limit
# is needed. spaCy rejects texts longer than `max_length`; keep the original
# limit but never let it fall below the length of the text being tokenized.
hindi_nlp.max_length = max(7568961, len(hindi_text_context))

hindi_doc = hindi_nlp(hindi_text_context)
# Count every occurrence so the word cloud reflects real frequencies. The
# original built the Counter from a set, which gave each token a count of 1.
# Whitespace and punctuation tokens are skipped because they are not words
# and would otherwise dominate the cloud.
hindi_tokens_counter = Counter(
    token.text for token in hindi_doc if not (token.is_space or token.is_punct)
)
hindi_tokens = set(hindi_tokens_counter)  # unique tokens, kept as a set as before


# Get the tokens and frequencies for Tamil language
tamil_nlp = Tamil()
tamil_nlp.max_length = max(7568961, len(tamil_text_context))

# Use the Tamil pipeline here; the original ran the Tamil text through
# `hindi_nlp`, leaving `tamil_nlp` unused.
tamil_doc = tamil_nlp(tamil_text_context)
tamil_tokens_counter = Counter(
    token.text for token in tamil_doc if not (token.is_space or token.is_punct)
)
tamil_tokens = set(tamil_tokens_counter)  # unique tokens, kept as a set as before

## Hindi

In [None]:
# Draw the Hindi word cloud from `hindi_tokens_counter` using the Lohit Devanagari font file.
generate_word_cloud(font_path="Devanagari/Lohit-Devanagari.ttf",text=hindi_tokens_counter)

## There is some Tamil and other non-Hindi text present here, which is why some weird shapes are drawn

## Tamil

In [None]:
# Draw the Tamil word cloud from `tamil_tokens_counter` using the Lohit Tamil font file.
generate_word_cloud(font_path="Tamil/Lohit-Tamil.ttf",text=tamil_tokens_counter)