possible directions: remove empty values, remove stop words, find most common words

In [2]:
import pandas as pd

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,title,content,score
0,LSE accomodation for couples,Hi! \r\n\r\nI will be moving with my partner t...,3
1,which accommodation do rich international stud...,where do rich students live during their studi...,0
2,Bankside or Garden Hall,Im a general course student and can't choose b...,2
3,Is the reserve list separate from an ordinary ...,I got placed on a reserve list for the Masters...,2
4,Postgraduate housing,Hello! I was just accepted into LSE for my mas...,2


In [3]:
# remove stopwords
import nltk
from nltk.corpus import stopwords
sw = stopwords.words('English')

stop_words = set(stopwords.words('english'))
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df['content'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df.head()

Unnamed: 0,title,content,score
0,LSE accomodation couples,LSE accomodation couples,3
1,accommodation rich international student stay in?,accommodation rich international student stay in?,0
2,Bankside Garden Hall,Bankside Garden Hall,2
3,reserve list separate ordinary waitlist?,reserve list separate ordinary waitlist?,2
4,Postgraduate housing,Postgraduate housing,2


In [4]:
# remove empty values
df.dropna(inplace = True)
df.head()

Unnamed: 0,title,content,score
0,LSE accomodation couples,LSE accomodation couples,3
1,accommodation rich international student stay in?,accommodation rich international student stay in?,0
2,Bankside Garden Hall,Bankside Garden Hall,2
3,reserve list separate ordinary waitlist?,reserve list separate ordinary waitlist?,2
4,Postgraduate housing,Postgraduate housing,2


In [5]:
# remove the puntuation 
import string 
nltk.download('punkt')

def remove_punctuations(text):
    if isinstance(text, str):
        translator = str.maketrans('', '', string.punctuation)
        return text.translate(translator)
    else:
        return text

df = df.fillna('')
df = df.applymap(remove_punctuations)
df.head(50)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\86157\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,title,content,score
0,LSE accomodation couples,LSE accomodation couples,3
1,accommodation rich international student stay in,accommodation rich international student stay in,0
2,Bankside Garden Hall,Bankside Garden Hall,2
3,reserve list separate ordinary waitlist,reserve list separate ordinary waitlist,2
4,Postgraduate housing,Postgraduate housing,2
5,PPE entry requirements,PPE entry requirements,1
6,apply graduate admissions best chance necessar...,apply graduate admissions best chance necessar...,1
7,LSE Equivalent Linear Algebra,LSE Equivalent Linear Algebra,2
8,GSS Application,GSS Application,0
9,Urbanest Westminster,Urbanest Westminster,2


In [6]:
# 100 most common words for the titles of posts
import collections 
all_text_title = ' '.join(df['title'])
word_counts = collections.Counter(all_text_title.split())
print(word_counts.most_common(100))

[('LSE', 216), ('MSc', 71), ('Economics', 36), ('student', 35), ('vs', 34), ('offer', 34), ('application', 32), ('Finance', 29), ('year', 29), ('accommodation', 28), ('Graduate', 28), ('students', 27), ('anyone', 27), ('Student', 24), ('International', 23), ('Summer', 21), ('Accommodation', 21), ('get', 20), ('BSc', 20), ('amp', 18), ('first', 18), ('Application', 17), ('undergrad', 17), ('lse', 17), ('Undergraduate', 17), ('apply', 16), ('Masters', 16), ('School', 16), ('Political', 15), ('Msc', 15), ('Support', 15), ('got', 14), ('masters', 14), ('advice', 14), ('course', 14), ('help', 14), ('international', 13), ('GSS', 13), ('courses', 13), ('Science', 13), ('Bsc', 13), ('Hall', 12), ('entry', 12), ('economics', 12), ('Undergrad', 12), ('graduate', 11), ('admissions', 11), ('Urbanest', 11), ('Mathematics', 11), ('heard', 11), ('Course', 11), ('UK', 11), ('2', 11), ('London', 11), ('Bankside', 10), ('Economy', 10), ('UCL', 10), ('late', 10), ('2023', 10), ('online', 10), ('Admission

In [7]:
# 100 most common words for the content of posts
all_text_content = ' '.join(df['content'])
word_counts = collections.Counter(all_text_content.split())
print(word_counts.most_common(100))

[('LSE', 216), ('MSc', 71), ('Economics', 36), ('student', 35), ('vs', 34), ('offer', 34), ('application', 32), ('Finance', 29), ('year', 29), ('accommodation', 28), ('Graduate', 28), ('students', 27), ('anyone', 27), ('Student', 24), ('International', 23), ('Summer', 21), ('Accommodation', 21), ('get', 20), ('BSc', 20), ('amp', 18), ('first', 18), ('Application', 17), ('undergrad', 17), ('lse', 17), ('Undergraduate', 17), ('apply', 16), ('Masters', 16), ('School', 16), ('Political', 15), ('Msc', 15), ('Support', 15), ('got', 14), ('masters', 14), ('advice', 14), ('course', 14), ('help', 14), ('international', 13), ('GSS', 13), ('courses', 13), ('Science', 13), ('Bsc', 13), ('Hall', 12), ('entry', 12), ('economics', 12), ('Undergrad', 12), ('graduate', 11), ('admissions', 11), ('Urbanest', 11), ('Mathematics', 11), ('heard', 11), ('Course', 11), ('UK', 11), ('2', 11), ('London', 11), ('Bankside', 10), ('Economy', 10), ('UCL', 10), ('late', 10), ('2023', 10), ('online', 10), ('Admission

In [8]:
# 100 most common words for the titles of the highest scoring 50 posts 
highest_scoring = df.sort_values(['score'], ascending = False).groupby('score').head(50)
all_text_title = ' '.join(highest_scoring['title'])
word_counts = collections.Counter(all_text_title.split())
print(word_counts.most_common(100))

[('LSE', 102), ('MSc', 38), ('vs', 21), ('Graduate', 19), ('application', 19), ('offer', 16), ('students', 14), ('accommodation', 13), ('student', 13), ('Political', 11), ('Economics', 11), ('Undergraduate', 11), ('anyone', 11), ('Student', 11), ('Support', 10), ('Masters', 10), ('get', 10), ('undergrad', 9), ('Finance', 9), ('London', 8), ('Summer', 8), ('advice', 8), ('International', 8), ('Science', 8), ('masters', 8), ('GSS', 8), ('Scheme', 8), ('Accommodation', 8), ('first', 8), ('year', 8), ('UK', 8), ('online', 8), ('lse', 8), ('2023', 8), ('Economy', 7), ('Msc', 7), ('Undergrad', 6), ('UCL', 6), ('Help', 6), ('School', 6), ('Sidney', 6), ('Webb', 6), ('waiting', 6), ('help', 6), ('Applying', 6), ('course', 6), ('History', 6), ('group', 6), ('apply', 6), ('BSc', 6), ('Bsc', 6), ('financial', 6), ('Housing', 6), ('management', 6), ('High', 5), ('Holborn', 5), ('got', 5), ('Online', 5), ('Accomodation', 5), ('graduate', 5), ('entry', 5), ('someone', 5), ('Urbanest', 5), ('social',

In [9]:
# 100 most common words for the content of the highest scoring 50 posts
all_text_title = ' '.join(highest_scoring['content'])
word_counts = collections.Counter(all_text_title.split())
print(word_counts.most_common(100))

[('LSE', 102), ('MSc', 38), ('vs', 21), ('Graduate', 19), ('application', 19), ('offer', 16), ('students', 14), ('accommodation', 13), ('student', 13), ('Political', 11), ('Economics', 11), ('Undergraduate', 11), ('anyone', 11), ('Student', 11), ('Support', 10), ('Masters', 10), ('get', 10), ('undergrad', 9), ('Finance', 9), ('London', 8), ('Summer', 8), ('advice', 8), ('International', 8), ('Science', 8), ('masters', 8), ('GSS', 8), ('Scheme', 8), ('Accommodation', 8), ('first', 8), ('year', 8), ('UK', 8), ('online', 8), ('lse', 8), ('2023', 8), ('Economy', 7), ('Msc', 7), ('Undergrad', 6), ('UCL', 6), ('Help', 6), ('School', 6), ('Sidney', 6), ('Webb', 6), ('waiting', 6), ('help', 6), ('Applying', 6), ('course', 6), ('History', 6), ('group', 6), ('apply', 6), ('BSc', 6), ('Bsc', 6), ('financial', 6), ('Housing', 6), ('management', 6), ('High', 5), ('Holborn', 5), ('got', 5), ('Online', 5), ('Accomodation', 5), ('graduate', 5), ('entry', 5), ('someone', 5), ('Urbanest', 5), ('social',

In [27]:
# Save the 100 most common words
# Specify the file name
file_name = "words.txt"

# Open the file in write mode
with open(file_name, "w") as file:
    # Loop through the list and write each item to a new line in the file
    for item, count in word_counts.most_common(100):
        file.write(item  + ' ' + str(count) + '\n')