In [2]:
# Import statements:
import pandas as pd
import numpy as np

# Checking versions:
pd_version = pd.__version__
np_version = np.__version__
print(pd_version)
print(np_version)

2.1.1
1.26.0


In [3]:
# Loading data:
df = pd.read_csv("data/okcupid_profiles.csv")

In [4]:
# Shape:
df.shape

(59946, 31)

In [5]:
# Summarize function for numerical variables:
df.describe()

Unnamed: 0,age,height,income
count,59946.0,59943.0,59946.0
mean,32.34029,68.295281,20033.222534
std,9.452779,3.994803,97346.192104
min,18.0,1.0,-1.0
25%,26.0,66.0,-1.0
50%,30.0,68.0,-1.0
75%,37.0,71.0,-1.0
max,110.0,95.0,1000000.0


In [6]:
# Summary for categorical variables:
df.describe(include="object")

Unnamed: 0,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,job,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
count,59946,59946,59946,54650,35551,56961,45866,53318,54266,51748,...,54458,52374,50308,48470,49409,49096,46175,47495,40721,47343
unique,5,2,3,12,18,6,3,32,217,21,...,54347,51516,48625,43520,49257,48961,43583,45548,39323,45440
top,single,m,straight,average,mostly anything,socially,never,graduated from college/university,white,other,...,.,enjoying it.,listening,my smile,ask me,family,my future,out with friends,ask me,you want to.
freq,55697,35829,51606,14652,16585,41780,37724,23959,32831,7589,...,12,61,82,529,16,6,161,89,45,200


In [7]:
# Check which columns have the least records:
df.isna().sum().sort_values(ascending=False)

offspring      35561
diet           24395
religion       20226
pets           19921
essay8         19225
drugs          14080
essay6         13771
essay9         12603
essay7         12451
essay3         11476
sign           11056
essay5         10850
essay4         10537
essay2          9638
job             8198
essay1          7572
education       6628
ethnicity       5680
smokes          5512
essay0          5488
body_type       5296
drinks          2985
speaks            50
height             3
status             0
location           0
last_online        0
income             0
orientation        0
sex                0
age                0
dtype: int64

In [8]:
# Creating a dataframe with just the essays:
essays_df = df.loc[:, ["essay0", "essay1", "essay2", "essay3", "essay4", 
                   "essay5", "essay6", "essay7", "essay8", "essay9"]]

https://www.kaggle.com/datasets/andrewmvd/okcupid-profiles/discussion/183145

Ok--so these are what each essay prompt corresponds to. In order:

1. essay0- My self summary
2. essay1- What I’m doing with my life
3. essay2- I’m really good at
4. essay3- The first thing people usually notice about me
5. essay4- Favorite books, movies, show, music, and food
6. essay5- The six things I could never do without
7. essay6- I spend a lot of time thinking about
8. essay7- On a typical Friday night I am
9. essay8- The most private thing I am willing to admit
10. essay9- You should message me if...

I'm unsure about what *order* the essays come in, but they mostly follow the pattern that essay0 is the most popular, and the latter essays are less popular. Essay #9 is the least popular, but overall--most people filled out the essay questions!

In [9]:
# Check which essays are the most to least popular:
essays_df.isna().sum().sort_values(ascending=True)

essay0     5488
essay1     7572
essay2     9638
essay4    10537
essay5    10850
essay3    11476
essay7    12451
essay9    12603
essay6    13771
essay8    19225
dtype: int64

In [10]:
# Tokenize essays into words and count the number of words
word_counts = essays_df.apply(lambda x: x.str.split().str.len())

# Calculate the average number of words per essay
average_words_per_essay = word_counts.mean()
print(average_words_per_essay.sort_values(ascending=False))

essay0    116.222226
essay4     97.270335
essay1     46.203192
essay9     34.770969
essay2     26.689214
essay6     25.787071
essay7     21.035456
essay5     21.028801
essay8     20.176052
essay3     16.643883
dtype: float64


Essay #0 had significantly more words than the other essays, and is followed by essay #4, #1, and #9. Essay #8 continues to be on the "low" side of things. 

In [11]:
from collections import Counter
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

def most_popular_words(df):
    """
    Calculates the ten most popular words in each essay column of the DataFrame, excluding stop words
    """
    top_ten_per_column = {}

    # Load stop words
    stop_words = set(stopwords.words('english'))

    for column in df.columns:
        word_freq = Counter()  # Counter to store word frequencies for each column

        # Iterate over each response in the column
        for response in df[column]:
            if isinstance(response, str):  # Check if the entry is a string
                # Tokenize the essay into words
                words = response.lower().split()  # Convert to lowercase and split into words

                # Update word frequency counts for the column, excluding stop words
                word_freq.update([word for word in words if word not in stop_words])

        # Extract top ten words for the column
        top_ten_words = [word for word, freq in word_freq.most_common(10)]

        # Add top ten words for the column to the dictionary
        top_ten_per_column[column] = top_ten_words

    return top_ten_per_column

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rohitkandala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
top_words_per_column = most_popular_words(essays_df)

for column, top_words in top_words_per_column.items():
    print(f"Top 10 words in {column}: {top_words}")

Top 10 words in essay0: ["i'm", 'love', 'like', 'new', 'good', 'people', 'life', 'enjoy', 'time', 'also']
Top 10 words in essay1: ["i'm", 'working', 'work', 'time', 'love', 'trying', 'like', 'new', 'life', 'also']
Top 10 words in essay2: ['good', "i'm", 'making', 'people', 'things', 'really', '-', 'like', 'also', 'love']
Top 10 words in essay3: ["i'm", 'people', 'smile', 'eyes', 'like', 'notice', 'really', 'look', 'hair', 'usually']
Top 10 words in essay4: ['love', 'like', "i'm", '-', 'anything', 'favorite', 'music', 'music:', 'movies:', 'books:']
Top 10 words in essay5: ['friends', 'family', '-', 'good', 'music', 'friends,', 'food', '1.', '2.', '3.']
Top 10 words in essay6: ["i'm", 'next', 'think', 'life', 'people', 'things', 'like', 'time', 'want', 'going']
Top 10 words in essay7: ['friends', 'home', 'watching', "i'm", 'going', 'friends,', 'dinner', 'friday', 'hanging', 'good']
Top 10 words in essay8: ["i'm", 'like', 'really', 'know', 'love', 'get', 'private', 'think', "i've", 'one']