In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from pprint import pprint
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Load the last-statements dataset from the Kaggle input directory.
csv_path = '/kaggle/input/sentenced-to-death-last-words/deathrow_text.csv'
df = pd.read_csv(csv_path)
# Show the available column names; later cells refer to them by name.
print(df.columns)


In [None]:
#CLEANING THE DATAFRAME
#drop columns that are not used in the analysis
df1 = df.drop(columns=['Unnamed: 0','inmate_info_link','lastwords_info_link'])

#fill empty rows in lastwords with the placeholder word "blank"
df1['lastwords'] = df1['lastwords'].fillna('blank')

#clean race column by removing extra spaces
#(read from df1 itself so this step does not rely on df and df1 sharing an index)
df1['race'] = df1['race'].str.replace(' ', '', regex=False)

#create the category column: "No Statement" when lastwords is the placeholder or one of
#the two boilerplate sentences meaning nothing was said, "Statement Given" otherwise
#source: https://kanoki.org/2019/07/17/pandas-how-to-replace-values-based-on-conditions/
no_statement_texts = [
    'No statement given.',
    'blank',
    'This inmate declined to make a last statement.',
]
df1['category'] = np.where(df1['lastwords'].isin(no_statement_texts), "No Statement", "Statement Given")

#frequency of whitespace-separated words over all lastwords entries
#(no cleaning has been applied yet, so case and punctuation variants count separately)
freq = pd.Series(' '.join(df1['lastwords']).split()).value_counts().to_frame()
print(freq)
freq.to_csv('word_frequency.csv')

#Based on word frequency, we will need to remove common words to see a meaningful output

In [None]:
#TEXT CLEANING
#Convert to lowercase
def Remove_uppercase(df, column_name):
    """Add a lower-cased copy of a text column to ``df``.

    The new column is named ``<column_name>_lower``. ``df`` is modified
    in place and the same object is returned.
    """
    target_column = column_name + '_lower'
    lowered = df[column_name].str.lower()
    df[target_column] = lowered
    return df
#Remove special characters
def Remove_special_characters(df, column_name):
    """Add a copy of a column with non-word characters collapsed to spaces.

    Each value is converted with ``str`` and every run of characters matching
    ``\\W+`` is replaced by a single space. The result is stored in
    ``<column_name>_nospecialchar``; ``df`` is modified in place and returned.
    """
    non_word_run = re.compile(r'\W+')

    def _strip(value):
        # Convert first so non-string values are handled the same way as strings.
        return non_word_run.sub(' ', str(value))

    df[column_name + '_nospecialchar'] = df[column_name].map(_strip)
    return df

#Remove commonly occurring words (English stop words) plus a few dataset-specific words that the generic list does not cover
#source: adding custom stop words: https://stackoverflow.com/questions/5511708/adding-words-to-nltk-stoplist
def Remove_commonwords(df, column_name, extra_stop_words=('irene', 'okay', 'know')):
    """Add a copy of a text column with stop words removed.

    Each value is split on whitespace, words found in the stop-word set are
    dropped, and the remaining words are re-joined with single spaces. The
    comparison is exact, so the input should already be lower-cased for the
    lower-case stop words to match.

    Parameters
    ----------
    df : DataFrame holding the text column; modified in place.
    column_name : name of the column to clean. Its values must be strings.
    extra_stop_words : iterable of additional words to drop on top of
        NLTK's English stop-word list. Defaults to the words previously
        hard-coded in this function.

    Returns
    -------
    The same ``df`` with a new column ``<column_name>_NoCommonWords``.
    """
    # A set gives constant-time membership tests; with a list, every word of
    # every row triggered a linear scan of the whole stop-word list.
    stop = set(nltk.corpus.stopwords.words('english'))
    stop.update(extra_stop_words)
    df[column_name+'_NoCommonWords'] = df[column_name].apply(
        lambda text: " ".join(word for word in text.split() if word not in stop)
    )
    return df


# Run the cleaning helpers in order. Each one adds its derived column to df1
# in place, and each step reads the column written by the step before it.
cleaning_steps = (
    (Remove_special_characters, 'lastwords'),
    (Remove_uppercase, 'lastwords_nospecialchar'),
    (Remove_commonwords, 'lastwords_nospecialchar_lower'),
)
for clean_step, source_column in cleaning_steps:
    clean_step(df1, source_column)
print(df1.columns)


In [None]:
#Word cloud over all last statements, leaving out rows categorised as "No Statement"
temp = df1[df1['category'] != 'No Statement']
cleaned_text = ' '.join(temp['lastwords_nospecialchar_lower_NoCommonWords'])
wordcloud2 = WordCloud().generate(cleaned_text)
plt.imshow(wordcloud2)
plt.axis("off")
plt.show()

#Top 5 words in the same cleaned text
freq = pd.Series(cleaned_text.split()).value_counts().head(5).to_frame()
print(freq)

In [None]:
#Create wordcloud on lastwords for race = Black, leaving out rows where no statement was given.
#Both conditions are combined into one mask: filtering df1 a second time would
#throw away the "No Statement" filter and let the placeholder rows back in.
has_statement = df1['category'] != 'No Statement'
temp = df1[has_statement & (df1['race'] == 'Black')]

cleaned_text = ' '.join(temp['lastwords_nospecialchar_lower_NoCommonWords'])
wordcloud2 = WordCloud().generate(cleaned_text)
plt.imshow(wordcloud2)
plt.axis("off")
plt.show()

#Top 5 words
freq = pd.Series(cleaned_text.split()).value_counts().head(5).to_frame()
print(freq)

In [None]:
#Create wordcloud on lastwords for race = White, leaving out rows where no statement was given.
#Both conditions are combined into one mask: filtering df1 a second time would
#throw away the "No Statement" filter and let the placeholder rows back in.
has_statement = df1['category'] != 'No Statement'
temp = df1[has_statement & (df1['race'] == 'White')]

cleaned_text = ' '.join(temp['lastwords_nospecialchar_lower_NoCommonWords'])
wordcloud2 = WordCloud().generate(cleaned_text)
plt.imshow(wordcloud2)
plt.axis("off")
plt.show()

#Top 5 words
freq = pd.Series(cleaned_text.split()).value_counts().head(5).to_frame()
print(freq)

In [None]:
#Create wordcloud on lastwords for race = Hispanic, leaving out rows where no statement was given.
#Both conditions are combined into one mask: filtering df1 a second time would
#throw away the "No Statement" filter and let the placeholder rows back in.
has_statement = df1['category'] != 'No Statement'
temp = df1[has_statement & (df1['race'] == 'Hispanic')]

cleaned_text = ' '.join(temp['lastwords_nospecialchar_lower_NoCommonWords'])
wordcloud2 = WordCloud().generate(cleaned_text)
plt.imshow(wordcloud2)
plt.axis("off")
plt.show()

#Top 5 words
freq = pd.Series(cleaned_text.split()).value_counts().head(5).to_frame()
print(freq)

In [None]:
#Count of ids per race, split by category (statement given vs. no statement)
#source: https://stackoverflow.com/questions/41119623/pandas-pivot-table-sort-values-by-columns
statement_counts = pd.pivot_table(
    df1,
    values=['id'],
    index=['race'],
    columns=['category'],
    aggfunc='count',
)
#Order the races by how many statements were given, largest first
race_order = statement_counts['id'].sort_values(by='Statement Given', ascending=False).index
temp = statement_counts.reindex(race_order)
print(temp)

In [None]:
#descriptive stats by age
#source: https://pbpython.com/pandas-qcut-cut.html
#pd.cut uses right-inclusive intervals by default, so the first bin is (0, 30], the next (30, 40], and so on
labels = ['1_21 to 30', '2_31 to 40', '3_41 to 50', '4_51 to 60', '5_61 to 70', '6_71 and more' ]
bins = [0,30, 40,50,60,70,100]
df1['age_bins'] = pd.cut(df1['age'], bins=bins, labels=labels)

#Crosstab of age bins vs statement category, shown as the share of rows within each age bin.
#Rows are counted (crosstab's default); summing the id column would weight each row by its
#identifier value and produce meaningless percentages.
#NOTE: no race is excluded here; every row of df1 with a binned age is included.
temp = pd.crosstab(df1.age_bins, df1.category, normalize='index')
print("View by age bins Vs Statement given/Not Given \n")
print(temp)
print('\n')


#Crosstab of (age bin, race) vs statement category, shown as the share of rows within each (age bin, race) group.
#NOTE: race = Other is not filtered out, so groups with very few rows can show extreme percentages.
temp = pd.crosstab([df1.age_bins, df1.race], df1.category, normalize='index')
print("View by age bins, race Vs Statement given/Not given \n")
print(temp)