In [13]:
import pandas as pd

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,title,content,score
0,LSE accomodation for couples,Hi! \n\nI will be moving with my partner to Lo...,3
1,which accommodation do rich international stud...,where do rich students live during their studi...,0
2,Bankside or Garden Hall,Im a general course student and can't choose b...,2
3,Is the reserve list separate from an ordinary ...,I got placed on a reserve list for the Masters...,2
4,Postgraduate housing,Hello! I was just accepted into LSE for my mas...,2


In [9]:
# remove empty values
df.dropna(inplace = True)
df.shape

(703, 3)

In [15]:
#check dataframe type
df.dtypes

title      object
content    object
score       int64
dtype: object

In [17]:
#make sure the title and content have the correct type
df['title'] = df['title'].astype(str)
df['content'] = df['content'].astype(str)

In [18]:
# remove stopwords
import nltk
from nltk.corpus import stopwords
sw = stopwords.words('English')

stop_words = set(stopwords.words('english'))
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df['content'] = df['content'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df.head()

Unnamed: 0,title,content,score
0,LSE accomodation couples,Hi! moving partner London looking couples' acc...,3
1,accommodation rich international student stay in?,"rich students live studies? Arabs, Chinese etc",0
2,Bankside Garden Hall,Im general course student can't choose two. ad...,2
3,reserve list separate ordinary waitlist?,got placed reserve list Masters environmental ...,2
4,Postgraduate housing,Hello! accepted LSE master's. wondering accomm...,2


In [20]:
# remove the puntuation 
import string 

def remove_punctuations(text):
    if isinstance(text, str):
        translator = str.maketrans('', '', string.punctuation)
        return text.translate(translator)
    else:
        return text

df = df.fillna('')
df = df.applymap(remove_punctuations)
df.head()

Unnamed: 0,title,content,score
0,LSE accomodation couples,Hi moving partner London looking couples accom...,3
1,accommodation rich international student stay in,rich students live studies Arabs Chinese etc,0
2,Bankside Garden Hall,Im general course student cant choose two advice,2
3,reserve list separate ordinary waitlist,got placed reserve list Masters environmental ...,2
4,Postgraduate housing,Hello accepted LSE masters wondering accommoda...,2


In [34]:
# 50 most common words for the titles of posts
import collections 
all_text_title = ' '.join(df['title'])
title_counts = collections.Counter(all_text_title.split())
common_titles = title_counts.most_common(50)
print(common_titles)

[('LSE', 216), ('MSc', 71), ('Economics', 36), ('student', 35), ('vs', 34), ('offer', 34), ('application', 32), ('Finance', 29), ('year', 29), ('accommodation', 28), ('Graduate', 28), ('students', 27), ('anyone', 27), ('Student', 24), ('International', 23), ('Summer', 21), ('Accommodation', 21), ('get', 20), ('BSc', 20), ('amp', 18), ('first', 18), ('Application', 17), ('undergrad', 17), ('lse', 17), ('Undergraduate', 17), ('apply', 16), ('Masters', 16), ('School', 16), ('Political', 15), ('Msc', 15), ('Support', 15), ('got', 14), ('masters', 14), ('advice', 14), ('course', 14), ('help', 14), ('international', 13), ('GSS', 13), ('courses', 13), ('Science', 13), ('Bsc', 13), ('Hall', 12), ('entry', 12), ('economics', 12), ('Undergrad', 12), ('graduate', 11), ('admissions', 11), ('Urbanest', 11), ('Mathematics', 11), ('heard', 11)]


In [35]:
# 50 most common words for the content of posts
all_text_content = ' '.join(df['content'])
content_counts = collections.Counter(all_text_content.split())
common_contents = content_counts.most_common(50)
print(common_contents)

[('LSE', 517), ('would', 260), ('Im', 247), ('get', 196), ('year', 183), ('student', 178), ('know', 173), ('anyone', 164), ('Hi', 155), ('offer', 148), ('nan', 148), ('application', 145), ('course', 132), ('like', 129), ('MSc', 128), ('students', 119), ('program', 107), ('I’m', 105), ('Thanks', 103), ('one', 96), ('looking', 93), ('wondering', 92), ('applied', 90), ('got', 89), ('apply', 89), ('want', 87), ('experience', 85), ('need', 84), ('Hello', 81), ('courses', 81), ('degree', 81), ('time', 80), ('help', 80), ('first', 79), ('also', 79), ('currently', 78), ('really', 75), ('still', 74), ('international', 74), ('2', 73), ('school', 73), ('good', 72), ('received', 69), ('much', 68), ('Ive', 66), ('personal', 66), ('Economics', 65), ('people', 65), ('Thank', 65), ('London', 64)]


In [37]:
# 50 most common words for the titles of the highest scoring 50 posts 
highest_scoring = df.sort_values(['score'], ascending = False).groupby('score').head(50)
top_text_title = ' '.join(highest_scoring['title'])
top_title_counts = collections.Counter(top_text_title.split())
top_common_titles = top_title_counts.most_common(50)
print(top_common_titles)

[('LSE', 102), ('MSc', 38), ('vs', 21), ('Graduate', 19), ('application', 19), ('offer', 16), ('students', 14), ('accommodation', 13), ('student', 13), ('Political', 11), ('Economics', 11), ('Undergraduate', 11), ('anyone', 11), ('Student', 11), ('Support', 10), ('Masters', 10), ('get', 10), ('undergrad', 9), ('Finance', 9), ('London', 8), ('Summer', 8), ('advice', 8), ('International', 8), ('Science', 8), ('masters', 8), ('GSS', 8), ('Scheme', 8), ('Accommodation', 8), ('first', 8), ('year', 8), ('UK', 8), ('online', 8), ('lse', 8), ('2023', 8), ('Economy', 7), ('Msc', 7), ('Undergrad', 6), ('UCL', 6), ('Help', 6), ('School', 6), ('Sidney', 6), ('Webb', 6), ('waiting', 6), ('help', 6), ('Applying', 6), ('course', 6), ('History', 6), ('group', 6), ('apply', 6), ('BSc', 6)]


In [39]:
# 50 most common words for the content of the highest scoring 50 posts
top_text_content = ' '.join(highest_scoring['content'])
top_content_counts = collections.Counter(top_text_content.split())
top_common_contents = top_content_counts.most_common(50)
print(top_common_contents)

[('LSE', 282), ('Im', 135), ('would', 118), ('get', 110), ('year', 98), ('know', 96), ('student', 94), ('Hi', 78), ('like', 77), ('students', 75), ('application', 73), ('anyone', 70), ('MSc', 68), ('offer', 67), ('Thanks', 60), ('nan', 57), ('course', 57), ('I’m', 56), ('program', 50), ('really', 48), ('looking', 48), ('got', 47), ('help', 47), ('one', 47), ('school', 47), ('first', 46), ('want', 44), ('wondering', 43), ('need', 42), ('degree', 42), ('also', 41), ('experience', 41), ('2', 40), ('Hello', 40), ('apply', 40), ('courses', 40), ('much', 39), ('time', 38), ('London', 38), ('international', 37), ('applied', 36), ('still', 35), ('Ive', 35), ('currently', 34), ('university', 34), ('received', 34), ('could', 34), ('good', 34), ('people', 33), ('advice', 33)]


In [40]:
import pickle

# Save the list to a file
with open('words.pkl', 'wb') as file:
    pickle.dump(common_titles, file)
    pickle.dump(common_contents, file)
    pickle.dump(top_common_titles, file)
    pickle.dump(top_common_contents, file)