# Feedback Prize - baseline/EDA

In [None]:
import numpy as np 
import pandas as pd 
import os
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

1. id - ID code for essay response
2. discourse_id - ID code for discourse element
3. discourse_start - character position where discourse element begins in the essay response
4. discourse_end - character position where discourse element ends in the essay response
5. discourse_text - text of discourse element
6. discourse_type - classification of discourse element
7. discourse_type_num - enumerated class label of discourse element
8. predictionstring - the word indices of the training sample, as required for predictions

In [None]:
# Read the competition's training CSV into a DataFrame.
train_df = pd.read_csv(
    filepath_or_buffer="../input/feedback-prize-2021/train.csv",
)
# Show the first rows as a quick check that the file parsed as expected.
train_df.head()

In [None]:
# Store the label column as a categorical and rename the one label that contains
# a space: display_text() looks labels up as module-level variable names, so each
# label has to be a valid identifier.
train_df['discourse_type'] = (
    train_df['discourse_type']
    .astype('category')
    .cat.rename_categories({'Concluding Statement': "Concluding_Statement"})
)
train_df["discourse_type"]

In [None]:
# How many rows (discourse elements) each essay id has, most frequent first.
fig, ax = plt.subplots(figsize=(20, 5))
train_df["id"].value_counts().plot(ax=ax)

In [None]:
from IPython.display import display, HTML
from pandas import DataFrame

# Highlight colours (CSS hex) for each discourse type.
# The variable names must match the values of train_df["discourse_type"] exactly,
# because display_text() resolves a label to its colour via globals()[label].
Lead = "#f2748d"  # pinkish red
Position = "#8ffbff"  # light cyan
Claim = "#eef51d" # yellow
Counterclaim = "#a5ff57" # light green
Rebuttal = "#fa89f4" # pink
Evidence = "#96c9ff" # light blue
Concluding_Statement = "#c391ff" # light purple


def display_text(demo_id, train_dir="../input/feedback-prize-2021/train",
                 annotations=None, palette=None):
    """Return one essay as an HTML string with its discourse elements highlighted.

    Parameters
    ----------
    demo_id : str
        Essay id; the essay is read from ``<train_dir>/<demo_id>.txt``.
    train_dir : str, optional
        Directory that holds the essay ``.txt`` files.
    annotations : pandas.DataFrame, optional
        Frame with ``id``, ``discourse_start``, ``discourse_end`` and
        ``discourse_type`` columns. Defaults to the module-level ``train_df``.
    palette : mapping, optional
        Maps a discourse type to a CSS colour. When omitted, the colour is
        taken from the module-level variable named after the discourse type
        (``Lead``, ``Claim``, ...).

    Returns
    -------
    str
        The essay text, HTML-escaped, with every annotated character span
        wrapped in a ``<span>`` carrying the colour of its discourse type.

    Raises
    ------
    FileNotFoundError
        If the essay file does not exist.
    KeyError
        If a discourse type has no colour in ``palette`` / module globals.
    """
    from html import escape

    if annotations is None:
        annotations = train_df

    with open(os.path.join(train_dir, demo_id + ".txt")) as f:
        data = f.read()

    # Only this essay's rows, in reading order, so the spans can be spliced
    # into the text left to right.
    doc_df = annotations[annotations["id"] == demo_id].sort_values("discourse_start")

    pieces = []
    cursor = 0  # first character of `data` that has not been emitted yet
    for raw_start, raw_end, label in zip(
        doc_df["discourse_start"], doc_df["discourse_end"], doc_df["discourse_type"]
    ):
        # Clamp to the unread part of the essay so overlapping or out-of-range
        # annotations can never duplicate or drop text.
        start = max(int(raw_start), cursor)
        end = min(int(raw_end), len(data))
        if end <= start:
            continue

        # The label comes from the essay's own row (doc_df), not from the
        # first rows of the whole training frame.
        hex_color = palette[label] if palette is not None else globals()[label]

        # Splice by character offset: only the annotated occurrence is
        # highlighted, even when the same sentence appears twice in the essay.
        pieces.append(escape(data[cursor:start], quote=False))
        pieces.append(
            f'<span style="background-color: {hex_color}; font-weight: bold">'
            f'{escape(data[start:end], quote=False)}</span>'
        )
        cursor = end

    pieces.append(escape(data[cursor:], quote=False))
    return "".join(pieces)
    
# Sample essays to preview, and their highlighted HTML renderings (same order).
id_list = ["71259B3EA87F","149E8C278863","7C9C2DF37B67","A02E1D0BEACF"]
color_text = [display_text(essay_id) for essay_id in id_list]





def left_align(df: DataFrame):
    """Return a pandas Styler for ``df`` with cells and headers left-aligned.

    Data cells get ``text-align: left`` through ``set_properties``; header
    cells (``th``) get the same rule through ``set_table_styles``.
    """
    header_rule = {'selector': 'th', 'props': [('text-align', 'left')]}
    return (
        df.style
        .set_properties(**{'text-align': 'left'})
        .set_table_styles([header_rule])
    )


class color:
    """ANSI escape sequences used to colour the printed class-name legend.

    Each attribute is a string that switches the terminal foreground colour
    (or style) when printed; ``E`` / ``CEND`` reset all attributes.
    """
    S = '\033[1m' + '\033[93m'  # bold + bright yellow
    E = '\033[0m'               # reset
    PURPLE = '\033[95m'         # bright magenta
    CGREEN = '\033[32m'         # green
    CYELLOW = '\033[33m'        # yellow
    CBLUE = '\033[34m'          # blue
    CVIOLET = '\033[35m'        # magenta (reads as pink)
    # CRED was assigned twice in the original ('\33[51m', then '\033[91m');
    # only the second assignment ever took effect, so it is the one kept.
    CRED = '\033[91m'           # bright red
    CBEIGE = '\033[36m'         # cyan
    CEND = '\033[0m'            # reset (same value as E)
    CBLUE2 = '\033[94m'         # bright blue
    
    


# Highlight colours in alphabetical order of their discourse type.
my_colors = [Claim, Concluding_Statement, Counterclaim, Evidence, Lead, Position, Rebuttal]

# Text legend: each label is printed in an ANSI colour approximating its highlight.
# The second row continues the two-word labels ("Concluding Statement",
# "Counter claim") underneath the first row.
print("class names with color")
legend_rows = (
    (
        (color.CYELLOW, "Claim "),
        (color.PURPLE, " Concluding "),
        (color.CGREEN, "Counter"),
        (color.CBLUE, " Evidence "),
        (color.CRED, "Lead"),
        (color.CBEIGE, " Position "),
        (color.CVIOLET, "Rebuttal"),
    ),
    (
        (color.PURPLE, "       Statement"),
        (color.CGREEN, " claim"),
    ),
)
for legend_row in legend_rows:
    print("".join(ansi_code + label for ansi_code, label in legend_row))

# Swatch strip of the exact HTML highlight colours, in the same order as my_colors.
sns.palplot(sns.color_palette(my_colors))



In [None]:
# Pair each sample essay id with its highlighted HTML and render the result as a
# left-aligned table.
data_dict = dict(id=id_list, text=color_text)
color_df = pd.DataFrame(data=data_dict)
styled_preview = left_align(color_df.head())
display(HTML(styled_preview.to_html()))

In [None]:
# Feedback Prize EDA with displacy

In [None]:
# List the distinct discourse labels in the training data (np.unique returns them sorted).
np.unique(train_df["discourse_type"])

In [None]:
from tqdm import tqdm
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud


def preprocess_text(df, lable):
    """Build one cleaned, stemmed string from all texts of a discourse type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``discourse_type`` and ``discourse_text`` columns.
    lable : str
        Discourse type whose rows are selected (parameter name kept as-is
        for backward compatibility).

    Returns
    -------
    str
        Space-joined Snowball stems of every lower-cased alphabetic token
        that is not an English stop word.
    """
    text = " ".join(df.loc[df["discourse_type"] == lable, "discourse_text"])

    # Load the stop-word list once, as a set. The original re-read the list
    # and scanned it linearly for every single token.
    stop_words = set(stopwords.words('english'))
    snow = nltk.stem.SnowballStemmer('english')

    # Keep letters only, then lower-case and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()

    return ' '.join(snow.stem(word) for word in tokens if word not in stop_words)

# 'Claim', 'Concluding_Statement', 'Counterclaim', 'Evidence',
#        'Lead', 'Position', 'Rebuttal'], dtype=object

# One word cloud per discourse type, laid out on a single 3x3 grid.
# The figure is created once: calling plt.figure() before every subplot (as the
# original did) starts a new figure each time, leaving each cloud alone in its
# own mostly empty 20x22 canvas.
cloud_labels = [
    "Lead",
    "Claim",
    "Concluding_Statement",
    "Counterclaim",
    "Evidence",
    "Position",
    "Rebuttal",
]

plt.figure(figsize=(20,22))
for grid_position, cloud_label in enumerate(cloud_labels, start=1):
    plt.subplot(3,3,grid_position)
    text1 = preprocess_text(train_df,cloud_label)
    word_cloud2 = WordCloud(collocations = False, background_color = 'white').generate(text1)
    plt.imshow(word_cloud2,interpolation='bilinear')
    plt.title(cloud_label)  # label each panel so the grid can be read on its own
    plt.axis('off')


