In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import datetime
from collections import Counter

In [None]:
# Render every figure below with matplotlib's "ggplot" style sheet.
mpl.style.use('ggplot')

In [None]:
# Load the data set that the rest of the notebook analyses.
# NOTE(review): unpickling can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
df = pd.read_pickle(filepath_or_buffer="data/new_all.pkl")

In [None]:
def how_long(y, m, d, now=None):
    """Return the number of whole days elapsed since the date ``y``-``m``-``d``.

    Parameters
    ----------
    y, m, d : int
        Year, month and day of the starting date (interpreted as midnight).
    now : datetime.datetime, optional
        Moment to measure against. Defaults to the current local time, which
        is what the function always used before this parameter existed; pass
        a fixed value to get reproducible results.

    Returns
    -------
    int
        The ``days`` component of ``now - datetime(y, m, d)``; negative when
        the date lies after ``now``.

    Raises
    ------
    ValueError
        If ``y``, ``m``, ``d`` do not form a valid calendar date.
    """
    if now is None:
        now = datetime.datetime.now()
    return (now - datetime.datetime(y, m, d)).days

In [None]:
# Average monthly amount per patron for every row.
df["mean_donation"] = df["dotations_per_month"] / df["patrons"]
# Dividing by zero patrons yields inf (or NaN for 0/0); mark those rows as
# missing instead. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0.
df.loc[df["patrons"] == 0, "mean_donation"] = np.nan

In [None]:
# Build the registration timestamp from its year/month/day parts in a single
# vectorized call (pd.to_datetime assembles dates from columns named
# year/month/day) instead of constructing one datetime per row with apply().
registration_parts = df[['registration_year', 'registration_month', 'registration_day']].rename(
    columns={'registration_year': 'year',
             'registration_month': 'month',
             'registration_day': 'day'}
)
registration_date = pd.to_datetime(registration_parts)

# Age of each profile in whole days. One shared "now" is used for the whole
# column, so every row is measured against the same instant.
df['profile_age_days'] = (pd.Timestamp.now() - registration_date).dt.days

df['registration_date'] = registration_date

In [None]:
# Preview the first ten rows, including the derived columns.
df.iloc[:10]

In [None]:
# Number of rows per registration year, as a two-column frame.
df_per_year = (
    df.value_counts("registration_year")
    .reset_index()
    .set_axis(['registration_year', 'count'], axis=1)
)

# One bar per registration year.
sns.barplot(x="registration_year", y="count", data=df_per_year)

In [None]:
# Number of rows per (registration year, registration month) pair.
# Despite the variable name, no year filter is applied: all years are counted.
df_2020_month = (
    df.value_counts(['registration_year', "registration_month"])
    .reset_index()
    .set_axis(['registration_year', 'registration_month', 'count'], axis=1)
)

# Monthly counts, with one bar colour per registration year.
sns.barplot(
    x="registration_month",
    y="count",
    hue='registration_year',
    data=df_2020_month,
)

In [None]:
# Ten rows with the highest patron count, largest first.
df[['name', 'patrons']].sort_values(by="patrons", ascending=False).head(10)

In [None]:
# Summary statistics of the patron counts.
df.patrons.describe()

In [None]:
# Distribution of patron counts (density histogram with a KDE overlay).
# sns.distplot is deprecated; this is the histplot form of the same plot.
# The bin count is explicit so the figure does not depend on seaborn's
# automatic binning.
sns.histplot(df['patrons'], bins=50, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Distribution of patron counts restricted to the 1..20 range (inclusive).
# sns.distplot is deprecated; this is the histplot form of the same plot.
sns.histplot(
    df.loc[df['patrons'].between(1, 20), 'patrons'],
    bins=20,
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
)

In [None]:
# Summary statistics of the monthly amount, ignoring rows where it is zero.
df.loc[df['dotations_per_month'].ne(0), 'dotations_per_month'].describe()

In [None]:
# Distribution of the non-zero monthly amounts.
# sns.distplot is deprecated; this is the histplot form of the same plot.
sns.histplot(
    df.loc[df["dotations_per_month"] != 0, "dotations_per_month"],
    bins=100,
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
)
# Optional: uncomment to draw the x-axis on a logarithmic scale.
# plt.xscale('log')

In [None]:
# Distribution of monthly amounts restricted to the 1..150 range (inclusive).
# sns.distplot is deprecated; this is the histplot form of the same plot.
sns.histplot(
    df.loc[df["dotations_per_month"].between(1, 150), "dotations_per_month"],
    bins=100,
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
)

In [None]:
# Ten rows with the highest monthly amount, largest first.
df[['name', 'dotations_per_month']].sort_values(by="dotations_per_month", ascending=False).head(10)

In [None]:
# Ten rows with the highest all-time total, largest first.
df[['name', 'total_dotations']].sort_values(by="total_dotations", ascending=False).head(10)

In [None]:
# Summary statistics of the non-zero totals, cast to integers for readability.
df.loc[df['total_dotations'].ne(0), 'total_dotations'].describe().astype(int)

In [None]:
# Ten rows with the highest patron count, largest first.
df[['name', 'patrons']].sort_values(by='patrons', ascending=False).head(10)

In [None]:
# Distribution of the mean donation per patron (missing values are dropped).
# sns.distplot is deprecated; this is the histplot form of the same plot.
# The bin count is explicit so the figure does not depend on seaborn's
# automatic binning.
sns.histplot(
    df['mean_donation'].dropna(),
    bins=50,
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
)

In [None]:
# Summary statistics of the mean donation, keeping only present, positive values.
df.loc[df['mean_donation'].notna() & df['mean_donation'].gt(0), 'mean_donation'].describe()

In [None]:
# Ten highest mean donations among rows whose mean donation is at least 100.
(
    df.loc[df['mean_donation'] >= 100, ['name', 'mean_donation']]
    .sort_values(by='mean_donation', ascending=False)
    .head(10)
)

In [None]:
# Distribution of mean donations below 100.
# sns.distplot is deprecated; this is the histplot form of the same plot.
sns.histplot(
    df.loc[df["mean_donation"] < 100, "mean_donation"],
    bins=50,
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
)

In [None]:
# Grid of pairwise plots over the columns of the full frame.
sns.pairplot(data=df)

In [None]:
# Monthly amount against patron count, for rows with fewer than 500 patrons.
sns.scatterplot(
    x='patrons',
    y='dotations_per_month',
    data=df.loc[df['patrons'].lt(500)],
)

In [None]:
# Monthly amount against profile age, for monthly amounts in 1..1000 (inclusive).
sns.scatterplot(
    x='profile_age_days',
    y='dotations_per_month',
    data=df.loc[df["dotations_per_month"].between(1, 1000)],
)

In [None]:
# All-time total against profile age, for totals of at most 100000.
sns.scatterplot(
    x='profile_age_days',
    y='total_dotations',
    data=df.loc[df["total_dotations"].le(100000)],
)

In [None]:
# Patron count against profile age, for rows with fewer than 150 patrons.
sns.scatterplot(
    x='profile_age_days',
    y='patrons',
    data=df.loc[df['patrons'].lt(150)],
)

In [None]:
# Mean donation against profile age, for mean donations below 150.
sns.scatterplot(
    x='profile_age_days',
    y='mean_donation',
    data=df.loc[df['mean_donation'].lt(150)],
)

In [None]:
# Most popular tags.
# The `tags` column holds "|"-separated strings; split each one into a list.
# Using the .str accessor leaves missing values missing instead of raising.
df['tags_list'] = df['tags'].str.split("|")

# Flatten the per-row lists into one list of tag occurrences, skipping rows
# that have no tag list.
tag_list = [tag for tags in df['tags_list'].dropna() for tag in tags]

# Occurrences per tag; exposed as a plain dict under its original name.
tag_counter = Counter(tag_list)
tag_list_coll = dict(tag_counter)

# most_common() returns (tag, count) pairs ordered by descending count, with
# ties kept in first-seen order. Passing the column names here also gives a
# well-formed empty frame when there are no tags at all.
popular_tags = pd.DataFrame(tag_counter.most_common(), columns=['tag', 'count'])

In [None]:
# Horizontal bars for the 30 most frequent tags.
sns.barplot(
    x='count',
    y='tag',
    data=popular_tags.nlargest(n=30, columns='count'),
)

In [None]:
# DataFrame.explode requires pandas >= 0.25.
# Give every tag of a row its own copy of that row.
df_tags = df.explode('tags_list')

# Per-tag averages of the four numeric measures, with the tag as a regular column.
df_tags_agg = (
    df_tags
    .groupby('tags_list')[['patrons', 'dotations_per_month', 'total_dotations', 'mean_donation']]
    .mean()
    .reset_index()
)


In [None]:
# Ten tags with the highest average patron count.
df_tags_agg[['tags_list', 'patrons']].nlargest(n=10, columns='patrons')

In [None]:
# Ten tags with the highest average monthly amount.
df_tags_agg[['tags_list', 'dotations_per_month']].nlargest(n=10, columns='dotations_per_month')

In [None]:
# Ten tags with the highest average all-time total.
df_tags_agg[['tags_list', 'total_dotations']].nlargest(n=10, columns='total_dotations')

In [None]:
# Ten tags with the highest average mean donation.
df_tags_agg[['tags_list', 'mean_donation']].nlargest(n=10, columns='mean_donation')

In [None]:
# 6.5% of the sum of all totals.
# NOTE(review): 6.5% is presumably the platform's commission rate -- confirm.
(6.5 / 100) * df['total_dotations'].sum()

In [None]:
# Sum of all-time totals per registration year.
partonite_zarobki = (
    df.groupby('registration_year')[['total_dotations']]
    .sum()
    .reset_index()
)
# 'prowizja' (Polish: commission) = 6.5% of each year's sum, rounded to 2 decimals.
partonite_zarobki['prowizja'] = partonite_zarobki['total_dotations'].mul(6.5 / 100).round(2)
partonite_zarobki

In [None]:
# Commission ('prowizja') per registration year, as bars.
sns.barplot(x='registration_year', y='prowizja', data=partonite_zarobki)

In [None]:
# In total: commission summed over all registration years, rounded to an integer.
round(partonite_zarobki['prowizja'].sum())

In [None]:
# Monthly: total commission spread evenly over 4 * 12 = 48 months.
# NOTE(review): the 4-year span is hard-coded -- presumably the period the
# data covers; confirm against the registration years in the data.
round(partonite_zarobki['prowizja'].sum() / (4 * 12))