Author: Anna Durbanova

Date: 15.02.2021

In [None]:
import pandas as pd
import numpy as np
import holoviews as hv
import seaborn as sns
import matplotlib.pyplot as plt
# Activate the bokeh backend for HoloViews plots.
hv.extension('bokeh')

In [None]:
!pip install hvplot


In [None]:
import hvplot as hv
import hvplot.pandas
import holoviews as hv
from hvplot import hvPlot

In [None]:
import os

# Collect the full path of every file below the Kaggle input directory,
# then print them one per line.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

In [None]:
# Read the OkCupid profiles CSV into `data` and preview the first five rows.
csv_path = "/kaggle/input/okcupid-profiles/okcupid_profiles.csv"
data = pd.read_csv(csv_path)
data.head()

## Missing Values

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Bar chart of the number of missing values per column, with slanted tick labels.
missing_per_column = data.isna().sum()
fig, ax = plt.subplots(figsize=(12, 5))
missing_per_column.plot.bar(ax=ax)
plt.xticks(rotation=50)



## Formatting and Cleaning Data

In [None]:
# Categorical columns in which a missing answer is replaced by an explicit label.
columns=["body_type",
      "diet", "education","drinks",
      "drugs","ethnicity",
      "job","offspring","pets","religion",
      "sign", "smokes", "speaks"
     ]
data[columns]=data[columns].fillna("No info")
# NOTE(review): 0 is used as the "missing" marker for height and income, so
# these zeros take part in any later describe()/corr()/mean on those columns.
data["height"]=data["height"].fillna(0)
data["income"]=data["income"].replace(-1, 0)

## Combining essays into 1 profile text
# Pick the essay columns by name instead of by position. The previous
# positional slice `iloc[:, 22:-1]` stopped before the last column of the
# frame, which (assuming the essays are the trailing columns) left the final
# essay out of the combined text.
essay_columns=[name for name in data.columns if str(name).startswith("essay")]

# Missing essays become empty strings before concatenation. Previously
# `astype(str)` turned them into the literal word "nan", and a missing essay0
# made `str.cat` return NaN, discarding the text of all other essays.
essays=data[essay_columns].fillna("")
profile_text=(essays[essay_columns[0]]
    .str
    .cat(essays[essay_columns[1:]], sep=" ")
    .str
    .strip())

# Count words on the real text, before the placeholder is inserted, so a
# profile without any essay text has 0 words (not the 2 words of "No info").
word_counts=profile_text.str.split().str.len()

data=data.drop(columns=essay_columns)
data["profile text"]=profile_text.replace("", "No info")
data["Total Words"]=word_counts


data.head(5)

In [None]:
data.isna().sum() ## Double-check: missing values left in each column after cleaning

## General Statistics and Data Types

In [None]:
# Summary statistics of the columns (pandas describes numeric columns by default).
data.describe()

On average, profile owners are 32 years old, with an income of around $20,000 (but income deviates a lot, so we can't really say that this average is representative).



In [None]:
# Data type of each column.
data.dtypes

5488 - empty profiles

In [None]:
# Pairwise correlation of the numeric columns, shaded by value.
# The frame also holds text columns; recent pandas versions raise if corr()
# is asked to correlate them, so restrict to numeric columns explicitly.
data.select_dtypes(include="number").corr().style.background_gradient()

No significant correlation between age, height and income

## 1. Age


In [None]:
# Interactive histogram of the age column.
data.hvplot.hist("age")

In [None]:
# Number of profiles per age and each age's share of all profiles (in %),
# shown as a bar chart of the counts.
df = data.groupby("age")[["sex"]].count()
profiles_per_age = df["sex"]
df["% of participants"] = profiles_per_age / profiles_per_age.sum() * 100
df = df.rename(columns={"sex": "number of participants"})
df.hvplot.bar(x='age', y='number of participants', rot=90)


In [None]:
# Profiles per age split by sex; bars follow the 40 most frequent ages.
most_common_ages = data['age'].value_counts().index[:40]
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x='age', hue='sex', palette="rocket",
              order=most_common_ages, ax=ax);


In [None]:
# Count plot of age split by sex, drawn as a wide figure-level seaborn plot.
catplot_options = dict(kind='count', hue='sex', palette='pastel', height=5, aspect=2)
sns.catplot(data=data, x='age', **catplot_options)

### Age and Total Number of Words in Profile

In [None]:
# Mean number of profile words per age, largest first, as a bar chart.
mean_words_by_age = data.groupby("age")[["Total Words"]].mean()
mean_words_by_age = mean_words_by_age.sort_values(by="Total Words", ascending=False)
mean_words_by_age.hvplot.bar(x='age', y='Total Words', rot=90)


Most participants are between 25 and 30 years old.

## 2. Status

In [None]:
# Number of profiles per relationship status and their share of all profiles (in %).
df = data.groupby("status")["sex"].count().to_frame()
df["% of participants"] = df["sex"] / df["sex"].sum() * 100
df.columns = ["number of participants", "% of participants"]
df


In [None]:
# Bar chart of the 'number of participants' column of `df`, one bar per status.
df.hvplot.bar(x='status', y='number of participants', rot=90)


In [None]:
# Profiles per status split by sex; bars ordered from most to least frequent status.
status_order = data['status'].value_counts().index[:40]
fig, ax = plt.subplots(figsize=(20, 5))
sns.countplot(data=data, x='status', hue='sex', palette="rocket",
              order=status_order, ax=ax);


In [None]:
# Mean number of profile words for each relationship status.
mean_words_by_status = data.groupby("status")[["Total Words"]].mean()
mean_words_by_status.hvplot.bar(x="status", y="Total Words")

On average, available people write about 500 words in their profile, as do those who are seeing someone.

## 3. Orientation

In [None]:
# Profiles per orientation (most common first) and their share of all profiles (in %).
df = (
    data
    .groupby("orientation")
    .agg({"sex": "count"})
    .sort_values(by="sex", ascending=False)
    .assign(**{"% of participants": lambda counts: counts["sex"] / counts["sex"].sum() * 100})
    .rename(columns={"sex": "number of participants"})
)
df

In [None]:
# Bar chart of the 'number of participants' column of `df`, one bar per orientation.
df.hvplot.bar(x='orientation', y='number of participants', rot=90)


In [None]:
# Profiles per orientation split by sex, most frequent orientation first.
orientation_order = data['orientation'].value_counts().head(10).index
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x='orientation', hue='sex',
              order=orientation_order, ax=ax);


In [None]:
# Number of profiles for each orientation value.
data['orientation'].value_counts()

In [None]:
# Median number of profile words for each orientation.
median_words_by_orientation = data.groupby("orientation")[["Total Words"]].median()
median_words_by_orientation.hvplot.bar(x="orientation", y="Total Words")

Bisexual participants have the highest median number of words in their profiles, more than gay or straight participants.

## 4. Body Type

In [None]:
# Per body type: number of profiles, mean word count of the profile text, and
# the share of all profiles (in %).
# "Total Words" is averaged instead of counted: count() only counts non-null
# rows, so it never measured words and merely repeated the number of profiles.
df=(data
 .groupby("body_type")
 .agg({"sex": "count", "Total Words": "mean"})
 .sort_values(by="sex", ascending=False)
)
df["% of participants"]=(df["sex"]/df["sex"].sum())*100
df=df.rename(columns= {"sex":"number of participants"})
df

In [None]:
# Bar chart of the 'number of participants' column of `df`, one bar per body type.
df.hvplot.bar(x='body_type', y='number of participants', rot=90)


In [None]:
# Bar chart of the 'Total Words' column of `df`, one bar per body type.
df.hvplot.bar(x='body_type', y='Total Words', rot=90)


In [None]:
# Profiles per body type split by sex; the 10 most frequent body types, in that order.
body_type_order = data['body_type'].value_counts().index[:10]
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x='body_type', hue='sex',
              order=body_type_order, ax=ax);


In [None]:
# Profiles per body type split by drinking habit; the 10 most frequent body types.
top_body_types = data['body_type'].value_counts().head(10).index
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=data, x='body_type', hue='drinks',
              order=top_body_types, ax=ax);

## 5. Diet

In [None]:
# Per diet: number of profiles, mean word count of the profile text, and the
# share of all profiles (in %); the counts are shown as a bar chart.
# "Total Words" is averaged instead of counted: count() only counts non-null
# rows, so it never measured words and merely repeated the number of profiles.
df=(data
 .groupby("diet")
 .agg({"sex": "count", "Total Words": "mean"})
 .sort_values(by="sex", ascending=False)
)
df["% of participants"]=(df["sex"]/df["sex"].sum())*100
df=df.rename(columns= {"sex":"number of participants"})
df.hvplot.bar(x='diet', y='number of participants', rot=90)


In [None]:
# Display the current summary table `df`.
df

In [None]:
# Profiles per diet split by sex; the 10 most frequent diets, in that order.
diet_order = data['diet'].value_counts().index[:10]
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=data, x='diet', hue='sex', order=diet_order, ax=ax);

## 6. Drinks

In [None]:
# Per drinking habit: number of profiles, mean word count of the profile text,
# and the share of all profiles (in %); the counts are shown as a bar chart.
# "Total Words" is averaged instead of counted: count() only counts non-null
# rows, so it never measured words and merely repeated the number of profiles.
df=(data
 .groupby("drinks")
 .agg({"sex": "count", "Total Words": "mean"})
 .sort_values(by="sex", ascending=False)
)
df["% of participants"]=(df["sex"]/df["sex"].sum())*100
df=df.rename(columns= {"sex":"number of participants"})
df.hvplot.bar(x='drinks', y='number of participants', rot=90)

In [None]:
# Bar chart of the 'Total Words' column of `df`, one bar per drinking habit.
df.hvplot.bar(x='drinks', y='Total Words', rot=90)

In [None]:
# Profiles per drinking habit split by sex, most frequent habit first.
drinks_order = data['drinks'].value_counts().head(10).index
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=data, x='drinks', hue='sex', order=drinks_order, ax=ax);

In [None]:
# Median number of profile words per drinking habit, largest first.
median_words_by_drinks = data.groupby("drinks")[["Total Words"]].median()
median_words_by_drinks = median_words_by_drinks.sort_values(by="Total Words", ascending=False)
median_words_by_drinks.hvplot.bar(x="drinks", y="Total Words")

## 7. Drugs

In [None]:
# Profiles per drug-use answer (most common first) with their share of all
# profiles (in %); the counts are shown as a bar chart.
drug_counts = data.groupby("drugs")[["sex"]].count().sort_values(by="sex", ascending=False)
total_profiles = drug_counts["sex"].sum()
drug_counts["% of participants"] = drug_counts["sex"] / total_profiles * 100
df = drug_counts.rename(columns={"sex": "number of participants"})
df.hvplot.bar(x='drugs', y='number of participants', rot=90)

In [None]:
# Median number of profile words per drug-use answer, largest first.
median_words_by_drugs = (
    data.groupby("drugs")[["Total Words"]]
    .median()
    .sort_values(by="Total Words", ascending=False)
)
median_words_by_drugs.hvplot.bar(x="drugs", y="Total Words")

In [None]:
# Profiles per drug-use answer split by sex, most frequent answer first.
drugs_order = data['drugs'].value_counts().index[:10]
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=data, x='drugs', hue='sex', order=drugs_order, ax=ax);

## Income

In [None]:
# Distribution of age within each income value, as a boxen plot.
plt.figure(figsize=(15, 5))

# The theme is set after the figure exists but before its axes are created,
# so the axes are drawn with the whitegrid style.
sns.set_theme(style="whitegrid")

ax = plt.gca()
sns.boxenplot(data=data, x="income", y="age", ax=ax);

In [None]:
# Per income value (rows with income 0 are excluded): number of profiles and
# mean word count of the profile text.
# "Total Words" is averaged instead of counted: count() only counts non-null
# rows, so it never measured words and merely repeated the number of profiles.
mask=data["income"]!=0
df=(data[mask]
 .groupby("income")
 .agg({"sex": "count", "Total Words": "mean"})
)

df=df.rename(columns= {"sex":"number of participants"})
df

In [None]:
# Bar chart of the 'Total Words' column of `df`, one bar per income value.
df.hvplot.bar(x='income', y='Total Words', rot=90)


## 9. Education

In [None]:
# Profiles per education level (most common first) with their share of all
# profiles (in %); the counts are shown as a bar chart.
df = (
    data
    .groupby("education")
    .agg({"sex": "count"})
    .sort_values(by="sex", ascending=False)
)
df = df.assign(**{"% of participants": df["sex"] / df["sex"].sum() * 100})
df = df.rename(columns={"sex": "number of participants"})
df.hvplot.bar(x='education', y='number of participants', rot=45)

## 10. Ethnicity

In [None]:
# Profiles per ethnicity (most common first) with their share of all profiles (in %).
ethnicity_counts = data.groupby("ethnicity")["sex"].count().sort_values(ascending=False)
df = ethnicity_counts.to_frame()
df["% of participants"] = ethnicity_counts / ethnicity_counts.sum() * 100
df = df.rename(columns={"sex": "number of participants"})
df

In [None]:
# Horizontal count plot of the 10 most frequent ethnicity values, split by sex.
top_ethnicities = data['ethnicity'].value_counts().index[:10]
fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(data=data, y='ethnicity', hue="sex", palette='Reds',
              order=top_ethnicities, ax=ax)
plt.show()

## 11. Height

In [None]:
# Profiles per height value (most common first) with their share of all
# profiles (in %); the counts are shown as a bar chart.
height_counts = data.groupby("height")[["sex"]].count()
df = height_counts.sort_values(by="sex", ascending=False)
share = df["sex"] / df["sex"].sum() * 100
df = df.assign(**{"% of participants": share})
df = df.rename(columns={"sex": "number of participants"})
df.hvplot.bar(x='height', y='number of participants', rot=45)


In [None]:
# Summary statistics of the height column.
data["height"].describe()

## 12. Gender

In [None]:
# Number of profiles per sex and each sex's share of all profiles (in %).
df = data.groupby("sex")[["age"]].count()
df = df.assign(**{"% of participants, sex": df["age"] / df["age"].sum() * 100})
df

In [None]:
# Number of profiles per sex (non-null ages counted in each group).
data.groupby("sex").agg({"age": "count"})


We have 59,946 participants: roughly 60% of them are men and 40% are women.

## 13. Pets

In [None]:
# Horizontal count plot of every pets answer, split by sex, most frequent first.
pets_order = data['pets'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 7))
sns.countplot(data=data, y='pets', hue='sex', palette='flare',
              order=pets_order, ax=ax)
plt.show()

## Q2: Words in the profile


## 1. Passion

In [None]:
# Profiles whose text contains "passion" (case-sensitive substring match).
passion=data[data["profile text"].str.contains("passion", regex=False)]

# Number of matching profiles per sex and each sex's share of the matches.
df_p=(passion
.groupby("sex")
 [["sex"]]
 .count()
)

df_p["%"]=(df_p["sex"]/df_p["sex"].sum())*100
# Format the percentage only in this table. Setting the global
# pd.options.display.float_format would add "%" to every float shown
# anywhere in the notebook afterwards.
df_p.style.format({"%": "{:.2f}%"})

In [None]:
# Number of rows in `passion`.
len(passion)

## 2. Love

In [None]:
# Profiles whose text contains "love" (case-sensitive substring match).
love=data[data["profile text"].str.contains("love", regex=False)]

# Number of matching profiles per sex and each sex's share of the matches.
df_l=(love
.groupby("sex")
 [["sex"]]
 .count()
)

df_l["%"]=(df_l["sex"]/df_l["sex"].sum())*100
# Format the percentage only in this table. Setting the global
# pd.options.display.float_format would add "%" to every float shown
# anywhere in the notebook afterwards.
df_l.style.format({"%": "{:.2f}%"})

## 3. Relationship

In [None]:
# Profiles whose text contains "relationship" (case-sensitive substring match).
rel=data[data["profile text"].str.contains("relationship", regex=False)]

# Number of matching profiles per sex and each sex's share of the matches.
df_r=(rel
.groupby("sex")
 [["sex"]]
 .count()
)

df_r["%"]=(df_r["sex"]/df_r["sex"].sum())*100
# Format the percentage only in this table. Setting the global
# pd.options.display.float_format would add "%" to every float shown
# anywhere in the notebook afterwards.
df_r.style.format({"%": "{:.2f}%"})

## 4. Serious Relationship

In [None]:
# Profiles whose text contains "serious relationship" (case-sensitive substring match).
rel=data[data["profile text"].str.contains("serious relationship", regex=False)]

# Number of matching profiles per sex and each sex's share of the matches.
df_r=(rel
.groupby("sex")
 [["sex"]]
 .count()
)

df_r["%"]=(df_r["sex"]/df_r["sex"].sum())*100
# Format the percentage only in this table. Setting the global
# pd.options.display.float_format would add "%" to every float shown
# anywhere in the notebook afterwards.
df_r.style.format({"%": "{:.2f}%"})

## 5. Hookups

In [None]:
# Profiles whose text contains "hook" (hook, hookup, hookups, hooking, ...).
# The pattern is matched literally: as a regular expression, "hook*" means
# "hoo" followed by any number of "k", so it also matched words such as
# "school" or "choose".
rel=data[data["profile text"].str.contains("hook", regex=False)]

# Number of matching profiles per sex and each sex's share of the matches.
df_r=(rel
.groupby("sex")
 [["sex"]]
 .count()
)

df_r["%"]=(df_r["sex"]/df_r["sex"].sum())*100
# Format the percentage only in this table. Setting the global
# pd.options.display.float_format would add "%" to every float shown
# anywhere in the notebook afterwards.
df_r.style.format({"%": "{:.2f}%"})

## 6. Marriage

In [None]:
# Profiles whose text contains "marriage" (case-sensitive substring match).
rel=data[data["profile text"].str.contains("marriage", regex=False)]

# Number of matching profiles per sex and each sex's share of the matches.
df_r=(rel
.groupby("sex")
 [["sex"]]
 .count()
)

df_r["%"]=(df_r["sex"]/df_r["sex"].sum())*100
# Format the percentage only in this table. Setting the global
# pd.options.display.float_format would add "%" to every float shown
# anywhere in the notebook afterwards.
df_r.style.format({"%": "{:.2f}%"})

## 7. Ons 

In [None]:
# Profiles whose text contains "ons" as a whole word, in any letter case.
# A plain substring search for "ons" also matched ordinary words such as
# "reasons" or "lessons", so word boundaries are required here.
rel=data[data["profile text"].str.contains(r"\bons\b", case=False)]

# Number of matching profiles per sex and each sex's share of the matches.
df_r=(rel
.groupby("sex")
 [["sex"]]
 .count()
)

df_r["%"]=(df_r["sex"]/df_r["sex"].sum())*100
# Format the percentage only in this table. Setting the global
# pd.options.display.float_format would add "%" to every float shown
# anywhere in the notebook afterwards.
df_r.style.format({"%": "{:.2f}%"})