In [None]:
import os
import re
import string
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.corpus import stopwords
from pprint import pprint
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
import seaborn as sns
import matplotlib as mp
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import plot_confusion_matrix,classification_report,confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import warnings
warnings.filterwarnings('ignore')

# Reading the files

In [None]:
data_dir='/kaggle/input/feedback-prize-2021/'

In [None]:
os.listdir(data_dir)

This directory contains two files and two folders, `test` and `train`.

Let's check the files in the `train` folder.

In [None]:
train_path=data_dir+"/train/"

In [None]:
train_path

In [None]:
len(os.listdir(train_path)) # total 15594 files

In [None]:
os.listdir(train_path)[0:10]

Reading the top ten files

In [None]:
# Print the name and the stripped lines of the first ten entries of the train folder.
for files in os.listdir(train_path)[0:10]:
  print('\n')
  print(files)
  print('\n')
  # The context manager closes the handle even if reading or printing raises;
  # iterating the handle yields the same lines as the manual readline loop.
  with open(train_path+files,'r') as essay_file:
      for next_line in essay_file:
          print(next_line.strip())

# NER and POS tagging with NLTK and Spacy

In [None]:
file='62C57C524CD2.txt'

In [None]:

with open(train_path+file, 'r') as file:
    data = file.read().replace('\n', '')

In [None]:
data

In [None]:
# Split the essay into word tokens, then pair every token with its part-of-speech tag.
# `text` ends up as the tagged token list that the chunking cell below consumes.
essay_tokens = nltk.word_tokenize(data)
text = nltk.pos_tag(essay_tokens)
print(text)

The output is a list of tuples, each containing a word from the text and its part-of-speech tag.

Next, we implement noun-phrase chunking as a first step towards identifying named entities, using a regular-expression rule: an optional determiner, any number of adjectives, then a noun.

IOB tags are then used to represent the chunk structure.

In [None]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [None]:
chunk_parser = nltk.RegexpParser(pattern)
text = chunk_parser.parse(text)
iob_tagged = tree2conlltags(text)
pprint(iob_tagged)

NER tagging with NLTK and ne_chunk

In [None]:
# Tokenize -> POS-tag -> named-entity chunk, one step per line
essay_tokens = word_tokenize(data)
essay_pos_tags = pos_tag(essay_tokens)
ne_tree = ne_chunk(essay_pos_tags)
print(ne_tree)

NER tagging with Spacy

In [None]:
doc=nlp(data)

In [None]:

pprint([(X.text, X.label_) for X in doc.ents])

`Bs` is tagged as PERSON, which is not correct. `only 1` tagged as CARDINAL and `4 percent` tagged as PERCENT are correct.

Token-level entity annotation using the IOB tagging scheme (spaCy's `ent_iob_` attribute) to describe the entity boundaries.

In [None]:
# One row per token: the token itself, its IOB code and its entity type
token_annotations = [(token, token.ent_iob_, token.ent_type_) for token in doc]
pprint(token_annotations)

"B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set.

In [None]:
[(x.orth_,x.pos_) for x in [y 
                                      for y
                                      in nlp(data) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

In [None]:
displacy.render(nlp(data), jupyter=True, style='ent')

#### For the first ten essays, let's run NER and POS tagging

In [None]:
# For each of the first ten entries of the train folder: print the text, its
# POS tags (stop words and punctuation excluded) and render its entities.
for files in os.listdir(train_path)[0:10]:
  # Print the file name itself; the original printed the file object's repr
  print('file name:',files)
  print('\n')
  # The context manager closes the handle even if a later step raises
  with open(train_path+files,'r') as essay_file:
    data = essay_file.read().replace('\n', '')
  print(data)
  print('\n')
  print("Part of speech tagging")
  print("\n")
  # Run the spaCy pipeline once and reuse the result for both outputs
  essay_doc = nlp(data)
  print([(token.orth_,token.pos_) for token in essay_doc
         if not token.is_stop and token.pos_ != 'PUNCT'])
  print('\n')
  displacy.render(essay_doc, jupyter=True, style='ent')
  print('\n')

#### GPE : countries, cities, states.
#### CARDINAL: numerals. 
#### NORP: nationalities or religious groups or political groups

We will do the same for the text files in the test directory.

In [None]:
test_path=data_dir+"/test/"

In [None]:
len(os.listdir(test_path)) # 5 text files

In [None]:
# Same report as for the train essays, for every entry of the test folder.
for files in os.listdir(test_path):
  # Print the file name itself; the original printed the file object's repr
  print('file name:',files)
  print('\n')
  # The context manager closes the handle even if a later step raises
  with open(test_path+files,'r') as essay_file:
    data = essay_file.read().replace('\n', '')
  print(data)
  print('\n')
  print("Part of speech tagging")
  print("\n")
  # Run the spaCy pipeline once and reuse the result for both outputs
  essay_doc = nlp(data)
  print([(token.orth_,token.pos_) for token in essay_doc
         if not token.is_stop and token.pos_ != 'PUNCT'])
  print('\n')
  displacy.render(essay_doc, jupyter=True, style='ent')
  print('\n')

Reading train.csv file


In [None]:
df_train = pd.read_csv(data_dir+"/train.csv")
df_train.head()

In [None]:
# will subset the dataframe keeping the id,discourse_text,discourse_type
df_train = df_train[["id","discourse_text","discourse_type"]]
df_train.head(10)

In [None]:
df_train.shape

In [None]:
# lets check the distribution of the discourse_type
sns.set(rc={'figure.figsize':(14,8)})
sns.countplot(data=df_train,x='discourse_type')

In [None]:
df_train['discourse_type'].value_counts()

In [None]:
# any null columns
df_train.isnull().sum()

#### Data preprocessing step

In [None]:
# Keep letters only: runs of non-alphanumeric characters and runs of digits are each replaced by one space
def cleanText(input_string):
    """Replace everything that is not an ASCII letter with spaces.

    Two passes are applied: first every run of characters outside
    ``A-Za-z0-9`` becomes a single space, then every run of digits becomes a
    single space. A digit run surrounded by separators therefore leaves
    several consecutive spaces.

    The former third pass that removed ``@`` was dropped: ``@`` is already
    replaced by the first pass, so it could never match.

    Args:
        input_string: text to clean (must be a ``str``).

    Returns:
        The cleaned string.
    """
    modified_string = re.sub('[^A-Za-z0-9]+', ' ', input_string)
    modified_string = re.sub('[0-9]+', ' ', modified_string)
    return modified_string
# Clean every discourse text, then show the row labelled 150 as a spot check
df_train['discourse_text'] = df_train['discourse_text'].map(cleanText)
df_train.loc[150, 'discourse_text']

In [None]:
# Drop every character that is not listed in string.printable
def remove_not_ASCII(text):
    """Return `text` with all characters outside ``string.printable`` removed."""
    allowed = string.printable
    kept_chars = (char for char in text if char in allowed)
    return ''.join(kept_chars)
df_train['discourse_text'] = df_train.discourse_text.apply(remove_not_ASCII)
df_train['discourse_text'][150]

In [None]:
#converting to lower case
df_train['discourse_text']=df_train['discourse_text'].str.lower()

In [None]:
#removing punctuations
df_train['discourse_text']=df_train['discourse_text'].str.translate(str.maketrans('','',string.punctuation))

In [None]:
nltk.download('stopwords')

In [None]:
stopWords=stopwords.words('english')
def removeStopWords(stopWords, rvw_txt):
    """Return `rvw_txt` without the words contained in `stopWords`.

    The text is split on whitespace, words found in `stopWords` are skipped,
    and the remaining words are joined with single spaces.
    """
    kept_words = []
    for token in rvw_txt.split():
        if token in stopWords:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)
df_train['discourse_text'] = [removeStopWords(stopWords,x) for x in df_train['discourse_text']]

In [None]:
#remove words containing numbers
df_train['discourse_text']=df_train['discourse_text'].apply(lambda x:re.sub('\w*\d\w*' , '', x) )

In [None]:
from nltk.util import ngrams

In [None]:
#splitting text into words
# .apply tokenizes row by row and keeps the result aligned with the frame's
# index. The former loop looked rows up by the labels 0..n-1, which only works
# while the index is the default RangeIndex.
df_train['text_tokens'] = df_train['discourse_text'].apply(word_tokenize)
df_train.head()

#### n grams

#### Unigram

In [None]:
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(df_train['discourse_text'])
unigrams = cntvec.transform(df_train['discourse_text'])
total_words = unigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:50]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:50]
df_ngrams



In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Unigram Distribution')
ax.set_xlabel("Unigrams")
ax.set_ylabel("Frequency")

plt.show()


#### bigrams

In [None]:
cntvec = CountVectorizer(ngram_range=(2,2))
cntvec.fit(df_train['discourse_text'])
bigrams = cntvec.transform(df_train['discourse_text'])
total_words = bigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:20]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:50]
df_ngrams

In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Bigrams Distribution')
ax.set_xlabel("Bigrams")
ax.set_ylabel("Frequency")

plt.show()

### trigrams

In [None]:
cntvec = CountVectorizer(ngram_range=(3,3))
cntvec.fit(df_train['discourse_text'])
trigrams = cntvec.transform(df_train['discourse_text'])
total_words = trigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:50]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:50]
df_ngrams

In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Trigrams Distribution')
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")

plt.show()

# Wordcloud visualization

#### Wordcloud and ngrams by Discourse Type

#### Discourse Type = Claim

In [None]:
df_train_claim = df_train[df_train.discourse_type == 'Claim']
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(df_train_claim['discourse_text'])
unigrams = cntvec.transform(df_train_claim['discourse_text'])
total_words = unigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]
df_ngrams


In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Unigram Distribution for Discourse Type Claim')
ax.set_xlabel("Unigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(2,2))
cntvec.fit(df_train_claim['discourse_text'])
bigrams = cntvec.transform(df_train_claim['discourse_text'])
total_words = bigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Bigrams Distribution for Discourse Type Claim')
ax.set_xlabel("Bigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(3,3))
cntvec.fit(df_train_claim['discourse_text'])
trigrams = cntvec.transform(df_train_claim['discourse_text'])
total_words = trigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Trigrams Distribution for Discourse Type Claim')
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
text=' ' .join([str(item) for item in df_train_claim['discourse_text'] ])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure(figsize=(15, 10), dpi=80)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

#### Claim discourse text has top words as student, school,college,teacher,car,driverless etc

#### discourse type = Evidence

In [None]:
df_train_evidence = df_train[df_train.discourse_type == 'Evidence']
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(df_train_evidence['discourse_text'])
unigrams = cntvec.transform(df_train_evidence['discourse_text'])
total_words = unigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]
df_ngrams

In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Unigram Distribution for Discourse Type Evidence')
ax.set_xlabel("Unigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(2,2))
cntvec.fit(df_train_evidence['discourse_text'])
bigrams = cntvec.transform(df_train_evidence['discourse_text'])
total_words = bigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Bigrams Distribution for Discourse Type Evidence')
ax.set_xlabel("Bigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(3,3))
cntvec.fit(df_train_evidence['discourse_text'])
trigrams = cntvec.transform(df_train_evidence['discourse_text'])
total_words = trigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Trigrams Distribution for Discourse Type Evidence')
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
text=' ' .join([str(item) for item in df_train_evidence['discourse_text'] ])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure(figsize=(15, 10), dpi=80)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

#### top words as electoral college, teacher,student, problem,friend etc

discourse type = Position

In [None]:
df_train_position = df_train[df_train.discourse_type == 'Position']
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(df_train_position['discourse_text'])
unigrams = cntvec.transform(df_train_position['discourse_text'])
total_words = unigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]
df_ngrams

In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Unigram Distribution for Discourse Type Position')
ax.set_xlabel("Unigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(2,2))
cntvec.fit(df_train_position['discourse_text'])
bigrams = cntvec.transform(df_train_position['discourse_text'])
total_words = bigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Bigrams Distribution for Discourse Type Position')
ax.set_xlabel("Bigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(3,3))
cntvec.fit(df_train_position['discourse_text'])
trigrams = cntvec.transform(df_train_position['discourse_text'])
total_words = trigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Trigrams Distribution for Discourse Type Position')
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
text=' ' .join([str(item) for item in df_train_position['discourse_text'] ])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure(figsize=(15, 10), dpi=80)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

#### top words as electoral college,popular,vote, community etc

#### discourse type = Concluding Statement    

In [None]:
df_train_concstat = df_train[df_train.discourse_type == 'Concluding Statement']
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(df_train_concstat['discourse_text'])
unigrams = cntvec.transform(df_train_concstat['discourse_text'])
total_words = unigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]
df_ngrams

In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Unigram Distribution for Discourse Type Concluding Statement')
ax.set_xlabel("Unigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(2,2))
cntvec.fit(df_train_concstat['discourse_text'])
bigrams = cntvec.transform(df_train_concstat['discourse_text'])
total_words = bigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Bigrams Distribution for Discourse Type Concluding Statement')
ax.set_xlabel("Bigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(3,3))
cntvec.fit(df_train_concstat['discourse_text'])
trigrams = cntvec.transform(df_train_concstat['discourse_text'])
total_words = trigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Trigrams Distribution for Discourse Type Concluding Statement')
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
text=' ' .join([str(item) for item in df_train_concstat['discourse_text'] ])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure(figsize=(15, 10), dpi=80)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

#### for concluding statement top words as think,work,need,life,better,help etc

discourse type = Lead

In [None]:
df_train_lead = df_train[df_train.discourse_type == 'Lead']
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(df_train_lead['discourse_text'])
unigrams = cntvec.transform(df_train_lead['discourse_text'])
total_words = unigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Unigram Distribution for Discourse Type Lead')
ax.set_xlabel("Unigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(2,2))
cntvec.fit(df_train_lead['discourse_text'])
bigrams = cntvec.transform(df_train_lead['discourse_text'])
total_words = bigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Bigrams Distribution for Discourse Type Lead')
ax.set_xlabel("Bigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(3,3))
cntvec.fit(df_train_lead['discourse_text'])
trigrams = cntvec.transform(df_train_lead['discourse_text'])
total_words = trigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Trigrams Distribution for Discourse Type Lead')
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")

plt.show()


#### Refrence https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

In [None]:
text=' ' .join([str(item) for item in df_train_lead['discourse_text'] ])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure(figsize=(15, 10), dpi=80)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

#### for discourse type as lead top words as college, student,electoral, school, phone etc

discourse_type=Counterclaim

In [None]:
df_train_countclaim = df_train[df_train.discourse_type == 'Counterclaim']
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(df_train_countclaim['discourse_text'])
unigrams = cntvec.transform(df_train_countclaim['discourse_text'])
total_words = unigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Unigram Distribution for Discourse Type Counter Claim')
ax.set_xlabel("Unigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(2,2))
cntvec.fit(df_train_countclaim['discourse_text'])
bigrams = cntvec.transform(df_train_countclaim['discourse_text'])
total_words = bigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Bigrams Distribution for Discourse Type Counter Claim')
ax.set_xlabel("Bigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(3,3))
cntvec.fit(df_train_countclaim['discourse_text'])
trigrams = cntvec.transform(df_train_countclaim['discourse_text'])
total_words = trigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Trigrams Distribution for Discourse Type Counter Claim')
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")

plt.show()


In [None]:
text=' ' .join([str(item) for item in df_train_countclaim['discourse_text'] ])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure(figsize=(15, 10), dpi=80)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

#### top words for counter claim as teacher,want,know,good,people,home etc

#### discourse type=Rebuttal

In [None]:
df_train_rebuttal = df_train[df_train.discourse_type == 'Rebuttal']
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(df_train_rebuttal['discourse_text'])
unigrams = cntvec.transform(df_train_rebuttal['discourse_text'])
total_words = unigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Unigram Distribution for Discourse Type Counter Rebuttal')
ax.set_xlabel("Unigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(2,2))
cntvec.fit(df_train_rebuttal['discourse_text'])
bigrams = cntvec.transform(df_train_rebuttal['discourse_text'])
total_words = bigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Bigrams Distribution for Discourse Type Rebuttal')
ax.set_xlabel("Bigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
cntvec = CountVectorizer(ngram_range=(3,3))
cntvec.fit(df_train_rebuttal['discourse_text'])
trigrams = cntvec.transform(df_train_rebuttal['discourse_text'])
total_words = trigrams.sum(axis=0) 
ngram_freq = [(word, total_words[0, index]) for word, index in cntvec.vocabulary_.items()]
ngram_freq =sorted(ngram_freq, key = lambda x: x[1], reverse=True)
ngram_freq[1:30]

In [None]:
df_ngrams = pd.DataFrame(ngram_freq, columns = ['word' , 'freq'])
df_ngrams = df_ngrams.groupby('word').sum()['freq'].sort_values(ascending=False)
df_ngrams=df_ngrams[1:30]

fig = plt.figure(figsize=(25,10))

ax = df_ngrams.plot(kind='bar')
ax.set_title('Trigrams Distribution for Discourse Type Rebuttal')
ax.set_xlabel("Trigrams")
ax.set_ylabel("Frequency")

plt.show()

In [None]:
text=' ' .join([str(item) for item in df_train_rebuttal['discourse_text'] ])
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
plt.figure(figsize=(15, 10), dpi=80)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

#### top words for Rebuttal as think, learning,better,work,car,help etc

# BoW model

In [None]:
X = df_train['discourse_text']
y = df_train['discourse_type']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

#### tfidf BOW model

In [None]:
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
tfidf.fit(X_train)

In [None]:
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
nb = MultinomialNB()
nb.fit(X_train_tfidf,y_train)

In [None]:
preds = nb.predict(X_test_tfidf)
predicted_prob = nb.predict_proba(X_test_tfidf)
print(classification_report(y_test,preds))
confusion_matrix(y_test,preds)

In [None]:
cntvec = CountVectorizer(ngram_range=(1,1))
cntvec.fit(X_train)
X_train_cntvec = cntvec.transform(X_train)
X_test_cntvec = cntvec.transform(X_test)

In [None]:
nb = MultinomialNB()
nb.fit(X_train_cntvec,y_train)

In [None]:
preds = nb.predict(X_test_cntvec)
predicted_prob = nb.predict_proba(X_test_cntvec)
print(classification_report(y_test,preds))
confusion_matrix(y_test,preds)

#### With both TF-IDF and CountVectorizer features, recall is good for Claim and Evidence, followed by Lead, Position and Concluding Statement

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train[['discourse_text']], df_train[['discourse_type']], test_size=0.2, random_state=101)

In [None]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

In [None]:
X_train = X_train.reset_index()
y_train = y_train.reset_index()
X_test = X_test.reset_index()
y_test = y_test.reset_index()

In [None]:
sentences = []
labels = []
test_sentences = []
test_labels = []

In [None]:
for index in range(1,len(X_train)):
    sentences.append(X_train['discourse_text'][index])
    labels.append(y_train['discourse_type'][index])

In [None]:
sentences[1:10], labels[1:10]

In [None]:
for index in range(1,len(X_test)):
    test_sentences.append(X_test['discourse_text'][index])
    test_labels.append(y_test['discourse_type'][index])

In [None]:
test_sentences[1:10], test_labels[1:10]

In [None]:
#  Tokenization settings
vocab_size = 10000      # passed to the Tokenizer as num_words
oov_token = "oov"       # passed to the Tokenizer as oov_token
max_length = 15         # passed to pad_sequences as maxlen
embedding_dim = 16      # not used by the cells below

#### tokenizer is only fit on the training sentences

In [None]:
# Fit the word index on the training sentences only
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

#### training sequences

In [None]:
# Convert the training sentences to integer sequences and pad them at the end up to max_length
training_sequences = tokenizer.texts_to_sequences(sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding='post')
for preview in (training_padded[0], training_padded.shape):
    print(preview)

#### test sequences

In [None]:
# Same conversion for the test sentences, using the tokenizer fitted on the training data
testing_sequences = tokenizer.texts_to_sequences(test_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding='post')
for preview in (testing_padded[0], testing_padded.shape):
    print(preview)

In [None]:
nb = MultinomialNB()
nb.fit(training_padded,labels)

In [None]:
preds = nb.predict(testing_padded)
predicted_prob = nb.predict_proba(testing_padded)
print(classification_report(test_labels,preds))
confusion_matrix(test_labels,preds)

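#### The metrics are poorer than those of the TF-IDF BoW model. This is expected: the padded sequences contain word indices rather than word counts, so they are not meaningful count features for MultinomialNB.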