In [97]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.font_manager as fm
import matplotlib.ticker as mticker
from plotnine import ggplot, aes, geom_line, geom_bar, theme, element_blank, element_line, element_text, coord_flip, scale_y_continuous, labs, scale_fill_manual, ggsave, scale_x_continuous
from mizani.formatters import number_format

import warnings


In [98]:
# Ignore every DeprecationWarning for the rest of the session so library
# deprecation notices do not clutter the cell outputs.
warnings.simplefilter('ignore', category=DeprecationWarning)

In [99]:

# Matplotlib's Tableau palette, looked up by name (e.g. colors['tab:purple']).
colors = mcolors.TABLEAU_COLORS

# Matplotlib font defaults: the sans-serif family, with Helvetica as the
# sans-serif face.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Helvetica'],
})


In [100]:


# Project root. Defaults to the original local checkout; set the
# PAYMENT_CASE_DIR environment variable to run the notebook from another
# machine or directory without editing this cell.
PROJECT_DIR = os.environ.get('PAYMENT_CASE_DIR', 'C:/Repos/payment_method_case_if')

# The working directory is switched (as before) so that relative paths in
# this notebook, such as the data file below and the plots/ export folder,
# resolve against the project root.
os.chdir(PROJECT_DIR)

# Raw case data, read from the Excel workbook in the project root.
imp = pd.read_excel("PaymentCasefromFinland_data.xlsx")

In [101]:


# Sanity check of the "grouped values" assumption: have I understood the data
# correctly, i.e. one row per combination of variable outcomes, with
# 'customers' stating how many have that combination?
sc_grouped = imp.drop(columns=['customers']).drop_duplicates()

# Prints 0 when every row is a distinct combination of values.
n_rows_lost = len(sc_grouped) - len(imp)
print(n_rows_lost)

# Base data for all slides:
#   Users  - 'Användare' where FeatureInvestigated == 1, otherwise 'Övriga'
#   Rating - the original Rating strings in title case
grunddata = imp.assign(
    Users=lambda df: np.where(df['FeatureInvestigated'].eq(1), 'Användare', 'Övriga'),
    Rating=lambda df: df['Rating'].str.title(),
)


0


In [102]:

def custom_theme():
    """Return the plotnine theme shared by the charts in this notebook.

    The theme blanks the panel background and the axis ticks, colours the
    major y grid lines grey, sets axis titles to size 14 and sets all text
    to the Carlito family at size 12.
    """
    y_grid = element_line(color="grey")
    title_text = element_text(size=14)
    base_text = element_text(family='Carlito', size=12)

    shared_theme = theme(
        panel_background=element_blank(),
        axis_ticks=element_blank(),
        panel_grid_major_y=y_grid,
        axis_title=title_text,
        text=base_text,
    )
    return shared_theme

def leaning_text():
    """Return a theme fragment that rotates the x-axis tick labels 45 degrees.

    The labels use ``hjust=1`` for horizontal justification.
    """
    tilted_labels = element_text(angle=45, hjust=1)
    return theme(axis_text_x=tilted_labels)

# Axis title reused by every chart that plots a percentage share.
andel_lab = "Andel i %"


In [103]:

# Slide 1: which ages use the method? (code-reviewed)
age_order = ['Missing', 'Group1', 'Group2', 'Group3', 'Group4']

# Customer totals per age class, restricted to rows with FeatureInvestigated == 1.
age_counts = (
    grunddata
    .query('FeatureInvestigated == 1')
    .groupby(['AgeClass'])
    .agg(Antal=('customers', 'sum'))
    .reset_index()
)

# Andel is each age class's percentage of the total; AgeClass becomes an
# ordered categorical so the bars follow age_order.
by_age_df = age_counts.assign(
    Andel=lambda d: 100 * d['Antal'] / d['Antal'].sum(),
    AgeClass=lambda d: pd.Categorical(d['AgeClass'], categories=age_order, ordered=True),
)

plt_age = (
    ggplot(by_age_df)
    + aes(x='AgeClass', y='Andel')
    + geom_bar(stat='identity', fill=colors['tab:purple'])
    + custom_theme()
    + theme(axis_title_x=element_blank())
    + labs(y=andel_lab)
    + leaning_text()
)


In [104]:
# Slide 2: does the rating affect the propensity? (code-reviewed)
rating_order = ['Missing', 'Ok', 'Good', 'Star']

# Customer totals per (Rating, Users) combination.
rating_counts = (
    grunddata
    .groupby(['Rating', 'Users'])
    .agg(Antal=('customers', 'sum'))
    .reset_index()
)

# Percentage share of each rating within its Users group.
rating_share = (
    100 * rating_counts['Antal']
    / rating_counts.groupby('Users')['Antal'].transform('sum')
)

# Shares for 'Övriga' are negated so the two groups extend in opposite
# directions from zero in the bar chart.
rating_df = rating_counts.assign(
    Andel=np.where(rating_counts['Users'] == 'Övriga', -rating_share, rating_share),
    Rating=pd.Categorical(rating_counts['Rating'], categories=rating_order),
)

plt_rating = (
    ggplot(rating_df)
    + aes(x='Rating', y='Andel', fill='Users')
    + geom_bar(stat='identity', position='identity')
    + leaning_text()
    + coord_flip()
    + custom_theme()
    + theme(axis_title_x=element_blank(), legend_title=element_blank())
    # Show absolute values on the axis, hiding the sign used for mirroring.
    + scale_y_continuous(labels=lambda breaks: [abs(b) for b in breaks])
    + labs(y=andel_lab)
    + scale_fill_manual(values=['tab:pink', 'tab:blue'])
)

In [105]:
# Slide 3: cube rating among users and others (code-reviewed)
cube_level_order = ['?', 0, 1, 2, 3]

# Customer totals per (Users, cube_level) combination.
cube_counts = (
    grunddata
    .groupby(['Users', 'cube_level'])
    .agg(Antal=('customers', 'sum'))
    .reset_index()
)

# Percentage share of each cube level within its Users group.
cube_share = (
    100 * cube_counts['Antal']
    / cube_counts.groupby('Users')['Antal'].transform('sum')
)

# Shares for 'Övriga' are negated so the two groups extend in opposite
# directions from zero in the bar chart.
cube_level_df = cube_counts.assign(
    Andel=np.where(cube_counts['Users'] == 'Övriga', -cube_share, cube_share),
    cube_level=pd.Categorical(cube_counts['cube_level'], categories=cube_level_order),
)

plt_cube_level = (
    ggplot(cube_level_df)
    + aes(x='cube_level', y='Andel', fill='Users')
    + geom_bar(stat='identity', position='identity')
    + coord_flip()
    + scale_fill_manual(values=['tab:pink', 'tab:blue'])
    # Show absolute values on the axis, hiding the sign used for mirroring.
    + scale_y_continuous(labels=lambda breaks: [abs(b) for b in breaks])
    + custom_theme()
    + theme(legend_title=element_blank())
    + labs(x='Cube level', y=andel_lab)
)

# Compute the mean cube level per group; unclear whether the mean is worth
# anything. Rows with the '?' level are left out. The 'Övriga' weights are
# negative here, but np.average divides by the sum of the weights, so the
# sign cancels.
cube_level_df_weight = cube_level_df.query("cube_level != '?'")
weighted_mean = cube_level_df_weight.groupby('Users').apply(
    lambda grp: np.average(grp['cube_level'], weights=grp['Andel'])
)

print(weighted_mean)

Users
Användare    1.748789
Övriga       1.544067
dtype: float64


In [106]:
# Data frame 4: contact information (code-reviewed)

def _contact_share(flag_col, label):
    """Customer-weighted mean of ``flag_col`` per ``Users`` group.

    Returns a frame with the columns ``Users`` and ``label``.
    """
    return (
        grunddata.groupby(['Users'])
        .apply(lambda grp: (grp[flag_col] * grp['customers']).sum() / grp['customers'].sum())
        .reset_index(name=label)
    )


email_df = _contact_share('contact_info_edm', 'Email')
telefon_df = _contact_share('contact_info_tm', 'Telefon')

# One row per (Users, contact type), with the share expressed in percent.
contact_wide = pd.merge(email_df, telefon_df, on='Users')
contact_long = contact_wide.melt(
    id_vars=['Users'],
    value_vars=['Email', 'Telefon'],
    var_name='Kontaktinfo',
    value_name='Andel',
)
contact_df = contact_long.assign(Andel=lambda d: d['Andel'] * 100)

plt_contact = (
    ggplot(contact_df)
    + aes(x='Kontaktinfo', y='Andel', fill='Users')
    + geom_bar(stat='identity', position='dodge')
    + custom_theme()
    + theme(legend_title=element_blank(), axis_title_x=element_blank())
    + scale_fill_manual(values=['tab:pink', 'tab:blue'])
    + labs(y=andel_lab)
    + leaning_text()
)

In [107]:
# Duration: number of years as a customer, users vs. others (code review done)

# Display order of the duration classes after the labels are rewritten below.
duration_order = ['0', '1', '2', '3', '4', '5 till 6', '7 till 9', '10 till 14',
                  '15 till 19', '20 till 29', '30+', 'Missing']

duration_df = (
    grunddata.groupby(['DurationClass', 'Users'])
    .agg(Antal=('customers', 'sum'))
    .reset_index()
    .assign(
        # Percentage share of each duration class within its Users group.
        Andel=lambda x: x.groupby('Users')['Antal'].transform(lambda y: 100 * y / y.sum()),
        # Replace 'to' with 'till ' and collapse repeated whitespace, so a
        # label such as '5 to 6' becomes '5 till 6' and matches duration_order.
        Duration=lambda x: pd.Categorical(
            x['DurationClass']
            .astype(str)
            .str.replace('to', 'till ', regex=False)
            .str.replace(r'\s+', ' ', regex=True),
            categories=duration_order,
            ordered=True,
        ),
    )
    # Shares for 'Övriga' are negated so the two groups extend in opposite
    # directions from zero in the bar chart.
    .assign(Andel=lambda x: np.where(x['Users'] == 'Övriga', -x['Andel'], x['Andel']))
)

plt_duration = (
    ggplot(duration_df, aes(x='Duration', y='Andel', fill='Users'))
    + geom_bar(stat='identity', position='identity')
    + coord_flip()
    # Show absolute values on the axis, as the rating and cube-level charts
    # do; without this the 'Övriga' side is labelled with negative percentages.
    + scale_y_continuous(labels=lambda x: [abs(v) for v in x])
    + custom_theme()
    + theme(legend_title=element_blank())
    + labs(x='Antal år som kund', y=andel_lab)
    + scale_fill_manual(values=['tab:pink', 'tab:blue'])
)


In [108]:
# Products (code-reviewed): customer totals per level of each product column,
# restricted to rows with FeatureInvestigated == 1.

def _customers_per_level(df, product_col):
    """Sum ``customers`` for each distinct value of ``product_col``.

    Returns a two-column frame: ``value`` holds the product level and a
    column named after ``product_col`` holds the customer total for it.
    """
    return (
        df.groupby(product_col, as_index=False)
        .agg({'customers': 'sum'})
        .rename(columns={product_col: 'value', 'customers': product_col})
    )


product_cols = ['product1', 'product2', 'product3', 'product4']

anvandare = grunddata.query('FeatureInvestigated==1')

product1, product2, product3, product4 = (
    _customers_per_level(anvandare, col) for col in product_cols
)

# Outer-join the four tables on the product level. Each table contributes
# one column named after its product, so the result already has the columns
# value, product1, ..., product4 and needs no relabelling.
merged_df = product1
for per_product in (product2, product3, product4):
    merged_df = pd.merge(merged_df, per_product, on='value', how='outer')

# Long format: one row per (level, product) pair.
df_long = pd.melt(
    merged_df,
    id_vars=['value'],
    value_vars=product_cols,  # columns to unpivot
    var_name='product',
    value_name='customers',
)

# Ordered categorical so the levels are handled in the order 2, 1, 0.
df_long['value'] = pd.Categorical(df_long['value'], categories=[2, 1, 0], ordered=True)

plt_products = (
    ggplot(df_long, aes(x='product', y='customers', fill='factor(value)'))
    + geom_bar(stat='identity', position='stack')
    + custom_theme()
    + theme(legend_title=element_blank())
    + scale_fill_manual(values=['tab:olive', 'tab:red', 'tab:orange'])
    + labs(y='Antal')
)


In [109]:
# Every finished chart keyed by its variable name; insertion order is kept.
plots = dict(
    plt_age=plt_age,
    plt_rating=plt_rating,
    plt_contact=plt_contact,
    plt_duration=plt_duration,
    plt_cube_level=plt_cube_level,
    plt_products=plt_products,
)



In [110]:
# Take a look at the plots

#for plot in plots.items():
#    print(plot)

In [111]:
# Export

#for name, plot in plots.items():
#    filename = f"plots/{name}.pdf"
#    ggsave(plot, filename=filename, dpi=300)
