In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns

# Load the pharmacy deliveries export and overwrite its column labels with
# descriptive names.
# NOTE(review): assumes the CSV has exactly these 12 columns in this order — confirm against the file.
df1 = pd.read_csv('pharmacies.csv')
df1.columns = ['Delivery date', 'Delivery time', 'Pharmacy number', 'Pharmacy postcode', 'YOB', 'Gender', 'CNK', 'Product name', 'ATC code', 'Units', 'Price', 'Contribution']

In [None]:
# (rows, columns) of the full data set.
df1.shape

In [None]:
# Work on a ~2% random sample of the full data set.
# A fixed random_state makes the sample, and every result derived from it,
# reproducible after a kernel restart; 42 matches the seed this notebook
# passes to train_test_split.
df5 = df1.sample(frac=0.020212, random_state=42)

In [None]:
# (rows, columns) of the sample.
df5.shape

In [None]:
# Missing values per column.
df5.isnull().sum()

In [None]:
df5.head()

In [None]:
# Summary statistics of the Price column.
df5['Price'].describe()

In [None]:
# Decode the numeric gender codes into labels; 0 and 3 both map to 'N/A'.
# The result is assigned back to the column: calling an in-place method on a
# column selection (df[col].replace(..., inplace=True)) is deprecated in pandas
# and has no effect on the frame once Copy-on-Write is enabled.
gender_labels = {0: 'N/A', 1: 'Male', 2: 'Female', 3: 'N/A'}
df5['Gender'] = df5['Gender'].replace(gender_labels)

In [None]:
# Summary statistics of the Contribution column.
df5['Contribution'].describe()

In [None]:
df5.head()

In [None]:
# Rows recorded with 0 units are set to 1 unit.
mask = df5.Units == 0
df5.loc[mask, 'Units'] = 1

In [None]:
df5.tail()

In [None]:
# Highest prices first (display only; df5 itself is not reordered).
df5.sort_values('Price', ascending=False)

In [None]:
# Distribution of the Gender values in the sample.
plt.hist(df5['Gender']);

In [None]:
# Heatmap of the pairwise correlations between the numeric columns of df5.
# The frame is restricted to numeric dtypes first because DataFrame.corr()
# raises on text columns (e.g. 'Product name') in pandas >= 2.0, whereas older
# versions silently dropped them; select_dtypes works on both.
plt.figure(figsize=(16,10))
correlation_matrix = df5.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True);

In [None]:
# Highest YOB values first (display only).
df5.sort_values('YOB', ascending=False)

In [None]:
# NOTE(review): exact duplicate of the previous cell.
df5.sort_values('YOB', ascending=False)

In [None]:
# Number of non-null YOB values.
df5['YOB'].count()

In [None]:
# Highest contributions first (display only).
df5.sort_values('Contribution', ascending=False)

In [None]:
# Rows whose Gender equals 'N/A'.
df5[df5['Gender']=='N/A']

In [None]:
# Split every product name on the letter 'X' (display only).
df5['Product name'].str.split('X')

In [None]:
# NOTE(review): the split-and-renamed frame built on the next line is never
# assigned, so the `df5` displayed afterwards is unchanged — confirm whether
# the 'Amount' column was meant to be stored.
df5['Product name'].str.rsplit("X", expand=True).rename(columns={0:'Product name', 1:'Amount'})
df5

In [None]:
# Rows where Price equals Contribution.
# NOTE(review): the name suggests these are treated as non-prescription sales — confirm.
nonprs = df5[df5['Price'] == df5['Contribution']]

In [None]:
nonprs

In [None]:
# Rows with exactly one unit.
df5[df5['Units'] == 1]

In [None]:
# Highest YOB values first (display only).
df5.sort_values('YOB', ascending=False)

In [None]:
# Largest unit counts first (display only).
df5.sort_values('Units', ascending=False)

In [None]:
df5.head()

In [None]:
# Label rows where Price equals Contribution as 'Yes', all others as 'No'.
# NOTE(review): this is the same condition that selects `nonprs` above; if that
# name means "non-prescription", the 'Yes'/'No' labels here are inverted —
# confirm the intended meaning.
df5['Prescribed'] = np.where(df5['Price']== df5['Contribution'], 'Yes', 'No')

In [None]:
df5.head()

In [None]:
# Number of rows per pharmacy postcode, split by the Prescribed flag
# (red / green bars).
# sns.factorplot was renamed to catplot, and its `size` argument to `height`,
# in seaborn 0.9; the old name has since been removed. The x variable is
# passed by keyword because current seaborn no longer accepts it positionally.
kleuren = ["r", "g"]
sns.catplot(x='Pharmacy postcode', data=df5, kind='count', hue='Prescribed', height=15, palette=kleuren)

In [None]:
# Store the calendar year of each delivery date in its own column.
df5['Year'] = pd.DatetimeIndex(df5['Delivery date']).year
df5.head()

In [None]:
# Horizontal count plot of the 15 most frequent product names.
plt.figure(figsize=(12,10))
sns.set(style="darkgrid")
ax = sns.countplot(y="Product name", data=df5, palette="Set2", order=df5['Product name'].value_counts().index[0:15])


In [None]:
# Missing values per column, including the columns added so far.
df5.isnull().sum()

In [None]:
# Pie chart of the share of rows per Prescribed flag, with percentage labels.
plt.figure(figsize=(14,7))
df5["Prescribed"].value_counts().plot(kind="pie",shadow=True,autopct = '%1.1f%%')
plt.title('Prescribed medication?', size=17);

In [None]:
# NOTE(review): DataFrame.value_counts() counts identical *whole rows*, so the
# bar labels are full-row tuples — probably a single column was intended; confirm.
df5.value_counts().nlargest(20).plot(kind='bar', figsize=(10,5))

In [None]:
# Number of rows per Prescribed flag.
df5['Prescribed'].value_counts().plot(kind='bar', figsize=(10,5))

In [None]:
# The 10 most frequent product names.
df5['Product name'].value_counts().nlargest(10).plot(kind='bar', figsize=(10,5))

In [None]:
# The 10 product names with the highest summed Price.
brk = df5.groupby('Product name')['Price'].sum()
brk.nlargest(10).plot(kind='bar')

In [None]:
# Summed Price per Prescribed flag (the name `brk` is reused from the previous cell).
brk = df5.groupby('Prescribed')['Price'].sum()
brk.nlargest(10).plot(kind='bar')

In [None]:
# Overlaid Price histograms for the 'Male' and 'Female' rows; the x axis is
# limited to 0-500.
df5[df5["Gender"] == "Male"]["Price"].hist(alpha=0.5)
df5[df5["Gender"] == "Female"]["Price"].hist(alpha=0.5)
plt.xlim(0, 500)
plt.legend(["Male", "Female"]);

In [None]:
df5.head()

In [None]:
# Number of distinct product names in the sample.
print(len(df5['Product name'].unique()))

In [None]:
def extractWords(text):
    """Split a product name into words, dropping digits, ',' and '+'.

    The text is split on whitespace; from every token all digit characters,
    commas and plus signs are removed (so 'PDR+SOLV' becomes 'PDRSOLV' and
    '2000IE' becomes 'IE'). Tokens that are empty after cleaning, such as
    pure numbers, are left out.

    Unlike a character-by-character scan that only emits a word when it meets
    a space, this also returns the last word of the text and never carries the
    separating space into the following word.

    Parameters
    ----------
    text : str
        The raw product name.

    Returns
    -------
    list of str
        The cleaned, non-empty words in their original order.
    """
    words = []

    for token in text.split():
        # Keep every character except digits and the two ignored symbols.
        word = ''.join(
            character for character in token
            if not (character.isdigit() or character in ',+')
        )
        if word:
            words.append(word)

    return words

In [None]:
def extractRelevant(words):
    """Keep only the words that are longer than five characters.

    Space characters are removed from each word before its length is checked,
    and the space-free form is what ends up in the result.

    Parameters
    ----------
    words : iterable of str
        Candidate words.

    Returns
    -------
    list of str
        The space-free words with more than five characters, in input order.
    """
    compacted = (candidate.replace(' ', '') for candidate in words)
    return [candidate for candidate in compacted if len(candidate) > 5]

In [None]:
words = extractWords('ELOCTA 2000IE PDR+SOLV VOOR OPL INJ 1')

In [None]:
relevant = extractRelevant(words)

In [None]:
for rel in relevant:
    rel.strip()
    print(rel)

In [None]:
print(len(relevant[1]))

In [None]:
def simplifyName(text):
    """Reduce a product name to its long words, separated by single spaces.

    The name is split with ``extractWords`` and filtered with
    ``extractRelevant``; the surviving words are joined with one space between
    them and no leading or trailing space.

    Parameters
    ----------
    text : str
        The raw product name.

    Returns
    -------
    str
        The simplified name, or an empty string if no word qualifies.
    """
    words = extractWords(text)
    relevant = extractRelevant(words)

    # join() only puts the separator *between* items, so the result carries no
    # trailing space that would break exact comparisons on the simplified name.
    return " ".join(relevant)

In [None]:
# Try the simplification on one sample product name.
print(simplifyName("TOUJEO SOLOSTAR 300E/ML OPL INJ VOORGEVULDE PEN 5"))

In [None]:
# Simplified name for every row.
df5['Simplified_name'] = df5['Product name'].apply(simplifyName)

In [None]:
# NOTE(review): displays up to 500 rows — consider a smaller preview.
df5.head(500)

In [None]:
# Number of distinct simplified names, followed by a count plot with one bar
# per distinct name.
print(len(df5['Simplified_name'].unique()))

sns.countplot(x = 'Simplified_name', data = df5)
plt.title('things')
plt.show()

In [None]:
def intTryParse(value):
    """Convert ``value`` with ``int()``; hand it back untouched on ValueError.

    Only ``ValueError`` is handled; any other exception raised by ``int()``
    propagates to the caller.
    """
    try:
        parsed = int(value)
    except ValueError:
        parsed = value
    return parsed

In [None]:
def getMG(text):
    """Extract the whole-number amount that precedes 'ML' or 'MG' in a name.

    The first occurrence of 'ML' is used if the text contains one, otherwise
    the first occurrence of 'MG'. Starting just before that unit and walking
    backwards, any non-digit characters are skipped until the first digit is
    met; the contiguous run of digits ending there is the amount. The walk
    stops at the first non-digit after the run, so a pack-size prefix such as
    the '20X' in '20X875MG' is not included.

    ``str.find`` reports "not found" as -1 (0 is a valid match position), so
    the presence checks compare against -1.

    Parameters
    ----------
    text : str
        The product name.

    Returns
    -------
    int
        The amount, or 0 when the text contains neither unit or no digit
        precedes the unit.
    """
    ml_pos = text.find('ML')
    mg_pos = text.find('MG')

    # 'ML' takes precedence over 'MG' when both occur.
    if ml_pos != -1:
        prefix = text[:ml_pos]
    elif mg_pos != -1:
        prefix = text[:mg_pos]
    else:
        return 0

    digits = []
    for character in reversed(prefix):
        if character in '0123456789':
            digits.append(character)
        elif digits:
            # The digit run has ended; anything further left belongs to
            # another part of the name.
            break

    if not digits:
        return 0

    # The digits were collected right-to-left; they are plain ASCII digits,
    # so int() cannot fail here.
    return int(''.join(reversed(digits)))
            

In [None]:
# Try getMG on one sample product name.
print(getMG("AUGMENTIN 875 MG TABL 20X875MG"))

In [None]:
# getMG result for every product name.
df5['MG'] = df5['Product name'].apply(getMG)

In [None]:
df5['MG'] = df5['MG'].astype('float64')

In [None]:
# NOTE(review): the converted Series is only displayed, not assigned back, so
# the dtype of the column inside df5 does not change.
df5["Simplified_name"].astype("string")

In [None]:
df5.head(20)

In [None]:
# Rows whose simplified name contains 'DALACIN' anywhere.
prod = df5[df5['Simplified_name'].str.contains('DALACIN')]
prod.head()

In [None]:
# Strip surrounding whitespace, then select the rows whose simplified name is
# exactly 'DALACIN'.
df5['Simplified_name'] = df5['Simplified_name'].str.strip()

prod = df5[df5['Simplified_name'] == 'DALACIN']
prod.head()

In [None]:
import datetime

In [None]:
def date_to_nth_day(date):
    """Return the 1-based day of the year of ``date`` (1 January -> 1).

    Parameters
    ----------
    date : datetime.date or datetime.datetime
        Any object providing ``timetuple()``; ``datetime.datetime`` subclasses
        are accepted as well. A time-of-day component does not affect the
        result.

    Returns
    -------
    int
        The day number within the year (1-365, or 1-366 in leap years).
    """
    # tm_yday is the standard library's own day-of-year field.
    return date.timetuple().tm_yday

In [None]:
# Parse the delivery dates into datetime values.
df5['Delivery date'] = pd.to_datetime(df5['Delivery date'])

In [None]:
# Result of date_to_nth_day for every delivery date.
df5['DayOfTheYear'] = df5['Delivery date'].apply(date_to_nth_day)

In [None]:
# Rows for pharmacy postcode 20 with a day-of-year below 30.
in_postcode = df5['Pharmacy postcode'] == 20
in_first_days = df5['DayOfTheYear'] < 30

# .copy() makes `pharmacy` an independent frame, so assigning columns to it
# later does not raise SettingWithCopyWarning or depend on df5.
pharmacy = df5[in_postcode & in_first_days].copy()

pharmacy.head(10)

In [None]:
# NOTE(review): 'Delivery date' and 'DayOfTheYear' are already computed on df5
# before `pharmacy` is sliced from it, so this cell and the next look
# redundant — confirm.
pharmacy['Delivery date'] = pd.to_datetime(pharmacy['Delivery date'])

In [None]:
pharmacy['DayOfTheYear'] = pharmacy['Delivery date'].apply(date_to_nth_day)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Predictor columns and target (Units) for the regression below.
X = pharmacy[['DayOfTheYear', 'CNK', 'Price', 'Year', 'Pharmacy number', 'Contribution']]
y = pharmacy.Units

In [None]:
pharmacy.head()

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Fit a linear regression on the training split only.
reg = LinearRegression()

reg.fit(X_train, y_train)

# All metrics are computed on the held-out rows, so rows the model was fitted
# on cannot inflate them.
y_predict = reg.predict(X_test)
print('Mean Absolute Error: %.3f' % (mean_absolute_error(y_test, y_predict)))
print('Mean Squared Error: %.3f' % (mean_squared_error(y_test, y_predict)))
print('R^2 Score: %.3f' % (r2_score(y_test, y_predict)))

# Actual and predicted target values for the test rows, side by side.
error7 = pd.DataFrame({'Current': np.array(y_test).flatten(), 'Predicted': y_predict.flatten()})
error7

In [None]:
# Heatmap of the pairwise correlations between the numeric columns of
# `pharmacy`. The frame is restricted to numeric dtypes first because
# DataFrame.corr() raises on text columns in pandas >= 2.0, whereas older
# versions silently dropped them; select_dtypes works on both.
plt.figure(figsize=(16,10))
correlation_matrix = pharmacy.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True);

In [None]:
def forAllMonths(clf, target, predictors, time, table):
    """Fit and score ``clf`` separately on twelve consecutive time windows.

    The ``time`` column is cut into windows of 30 units each: window ``i``
    (0-based) holds the rows with ``i*30 < time <= (i+1)*30``. The twelfth
    window has no upper bound, so values above 360 are kept rather than
    dropped. With a day-of-year column the windows approximate, but do not
    exactly match, calendar months.

    For every window the rows are split 70/30 with a fixed seed, ``clf`` is
    re-fitted on the training part and scored on the held-out part, so the
    reported score is not inflated by rows the model was fitted on.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit(X, y)`` and ``score(X, y)``; it is re-fitted in
        place for every window.
    target : str
        Name of the target column in ``table``.
    predictors : list of str
        Names of the feature columns in ``table``.
    time : str
        Name of the numeric column used to form the windows.
    table : pandas.DataFrame
        The data to split into windows.

    Returns
    -------
    list of float
        Twelve scores, one per window, in order. A window with fewer than two
        rows cannot be split and yields ``nan``.
    """
    window_size = 30
    n_windows = 12
    results = []

    for i in range(n_windows):
        start = i * window_size
        in_window = table[time] > start
        # Every window but the last is closed on the right, so that boundary
        # values (30, 60, ...) belong to exactly one window.
        if i < n_windows - 1:
            in_window = in_window & (table[time] <= start + window_size)
        temp = table[in_window]

        # train_test_split needs at least two rows to build both parts.
        if len(temp) < 2:
            results.append(float('nan'))
            continue

        X = temp[predictors]
        y = temp[target].values

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

        clf.fit(X_train, y_train)

        results.append(clf.score(X_test, y_test))

    return results

In [None]:
from sklearn.linear_model import LinearRegression

# Month labels, one per window returned by forAllMonths.
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# A fresh estimator; forAllMonths re-fits it for every window.
reg = LinearRegression()

feature_columns = ['DayOfTheYear', 'CNK', 'Price', 'Year', 'Pharmacy number', 'Contribution']
result = forAllMonths(reg, 'Units', feature_columns, 'DayOfTheYear', df5)

# NOTE(review): the number printed as "accuracy" is whatever the estimator's
# score() returns, scaled by 100 — confirm the label is what readers expect.
for month, score in zip(months, result):
    print(f'{month} - accuracy: {round(score * 100, 1)}%')

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns and target (Units) taken from the whole sample.
X = df5[['Simplified_name', 'Year', 'Delivery date', 'Price', 'Pharmacy postcode', 'MG', 'Gender', 'CNK', 'Prescribed']]
y = df5.Units

# Convert every column to numbers; errors='coerce' turns values that cannot be
# parsed into NaN.
# NOTE(review): text columns such as 'Simplified_name', 'Gender' and
# 'Prescribed' presumably become all-NaN here and all-0 after the fillna
# below, which would leave them without any information — confirm, or encode
# them explicitly.
X = X.apply(pd.to_numeric, errors='coerce')
y = y.apply(pd.to_numeric, errors='coerce')

# Replace the remaining NaN values with 0.
X.fillna(0, inplace=True)
y.fillna(0, inplace=True)

# Hold out 33% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
df5.head()

In [None]:
# Column dtypes of the sample after all transformations.
df5.dtypes

In [None]:
# Load a second, gzip-compressed CSV export.
new = pd.read_csv("ds2020.csv.gz")

In [None]:
# Overwrite the column labels with the same 12 names used for df1.
# NOTE(review): assumes this file has the same column layout — confirm.
new.columns = ['Delivery date', 'Delivery time', 'Pharmacy number', 'Pharmacy postcode', 'YOB', 'Gender', 'CNK', 'Product name', 'ATC code', 'Units', 'Price', 'Contribution']

In [None]:
new.head()

In [None]:
# (rows, columns) of the second data set.
new.shape

In [None]:
# Number of distinct CNK values.
len(new['CNK'].unique())

In [None]:
# Number of rows per Gender value.
sns.countplot(x = 'Gender', data = new)

In [None]:
# Heatmap of the pairwise correlations between the numeric columns of `new`.
# The frame is restricted to numeric dtypes first because DataFrame.corr()
# raises on text columns (e.g. 'Product name') in pandas >= 2.0, whereas older
# versions silently dropped them; select_dtypes works on both.
plt.figure(figsize=(16,10))
corrs = new.select_dtypes(include='number').corr()
sns.heatmap(data = corrs, annot = True)

In [None]:
# Number of rows per pharmacy postcode in `new`, split by a Prescribed flag
# that is 'Yes' where Price equals Contribution and 'No' otherwise.
# sns.factorplot was renamed to catplot, and its `size` argument to `height`,
# in seaborn 0.9; the old name has since been removed. The x variable is
# passed by keyword because current seaborn no longer accepts it positionally.
kleuren = ["r", "g"]
new['Prescribed'] = np.where(new['Price']== new['Contribution'], 'Yes', 'No')
sns.catplot(x='Pharmacy postcode', data=new, kind='count', hue='Prescribed', height=15, palette=kleuren)