In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression
%matplotlib inline
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

#Question 1: Are there any null values or outliers? How will you wrangle/handle them?
#Answer 1: "SP" is not a standard country code. It might stand for Spain, but I will keep it as "SP" since there is no direction on how to confirm that.

#Question 2: Are there any variables that warrant transformations?
#Answer 2: "Income" is stored as an object (string) column rather than a float, so it is left out of df.describe(). Its column header also has surrounding spaces, so I will use str.strip to remove them and then change the dtype to float.

#Question 3: Are there any useful variables that you can engineer with the given data?
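#Answer 3: Yes. The spending and purchase-count columns are numeric, so they can be combined into new variables such as total purchases and total amount spent per customer (see Total_Purchases and Total_Sales below).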

#Question 4: Do you notice any patterns or anomalies in the data? Can you plot them?
#Answer 4: The country "SP" has by far the most users, with roughly 1000, while the next highest, "SA", has roughly 300. This can have an impact on how the merchandise and/or website is marketed to those countries.

In [None]:
# Path to the marketing dataset, relative to the notebook's working directory.
marketing_data = '../input/marketing-data/marketing_data.csv'
# Read from the variable above so the path is defined in exactly one place.
df = pd.read_csv(marketing_data)
# Preview the first 11 rows.
df.head(11)

In [None]:
# Give the "Kidhome" column the shorter name "Kids".
df = df.rename(columns={"Kidhome": "Kids"})

In [None]:
# Preview the first 11 rows to check the column names.
df.head(11)

In [None]:
# Give the "Teenhome" column the shorter name "Teens".
df = df.rename(columns={"Teenhome": "Teens"})

In [None]:
# Preview the first 11 rows to check the column names.
df.head(11)

In [None]:
# Summary statistics of the frame; by default describe() covers only numeric columns.
df.describe()

In [None]:
# Bar chart of how many rows fall under each country code.
fig, ax = plt.subplots(figsize=(7, 6))
sns.countplot(data=df, x='Country', palette='rocket_r', ax=ax)

In [None]:
# Show the current column names so stray whitespace in the headers is visible.
print(df.columns)

#Here I see that the "Income" header has surrounding spaces. I need to remove them and change the column type to float so that it is included in df.describe()

In [None]:
# Show the dtype of every column.
print(df.dtypes)

#Here we see that Income has dtype object rather than a numeric type, and its header still has spaces

In [None]:
# Trim leading/trailing whitespace from every column header.
df = df.rename(columns=str.strip)

#This allows me to get rid of the spaces in all headers in my dataframe

In [None]:
print(df.columns)

#Here we see that the Income header no longer has spaces

In [None]:
# Strip the currency formatting from Income so the strings can be parsed as numbers.
# regex=False makes each pattern a literal: under regex matching '$' is the
# end-of-string anchor, so the dollar sign would be left in place (the default
# value of `regex` differs between pandas versions).
df['Income'] = (
    df['Income']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.strip()  # also drop any surrounding whitespace in the values
)
df['Income']

#Removed the commas and dollar signs from the Income column so the values are ready for conversion to a numeric type

In [None]:
# Cast the Income column to floating point, then display it.
df = df.astype({"Income": "float"})
df["Income"]

#Changed the Income column into a float

In [None]:
# Summary statistics again, after Income was converted to float above.
df.describe()

#Now the Income column is ready to be included in numerical calculations. Before, it could not be used because its dtype was object.

In [None]:
# Number of rows per country code, shown as a one-column frame.
df["Country"].value_counts().to_frame()
#This shows which countries dominate the company's user base. My hypothesis is that the company aimed its marketing at one specific country and neglected the others. A useful follow-up would be units sold per country

#### Section 02: Statistical Analysis
Please run statistical tests in the form of regressions to answer these questions and propose data-driven action recommendations to your CMO. Make sure to interpret your results with non-statistical jargon so your CMO can understand your findings.

#Question 1: What factors are significantly related to the number of store purchases?
#Answer 1: Judging from the linear regressions, meat seems to have sold a lot and to have had a positive impact.

#Question 2: Does US fare significantly better than the Rest of the World in terms of total purchases?
#Answer 2: Not better. It is in 3rd place behind SP and GER in total sales

#Question 3: Your supervisor insists that people who buy gold are more conservative. Therefore, people who spent an above average amount on gold in the last 2 years would have more in store purchases. Justify or refute this statement using an appropriate statistical test
#Answer 3: A simple linear regression model supports this. The average amount spent on gold is about 44, and in-store purchases go up for people who spent that amount (about 44) or more.

#Question 4: Fish has Omega 3 fatty acids which are good for the brain. Accordingly, do "Married PhD candidates" have a significant relation with amount spent on fish? What other factors are significantly related to amount spent on fish? (Hint: use your knowledge of interaction variables/effects)
#Answer 4: 

#Question 5: Is there a significant relationship between geographical region and success of a campaign?
#Answer 5: It seems that SP has the highest purchases and sales. It would seem that people who live in SP got more out of the campaign.

In [None]:
# LinearRegression is already imported in the first cell; this repeat is redundant but harmless.
from sklearn.linear_model import LinearRegression
# Unfitted model instance. The regression cells further down each create their own `lm` before fitting.
lm = LinearRegression()

In [None]:
# Question 3 concerns customers who spend more than the average amount on gold.
# The summary below includes that mean (reported by the author as roughly 44).
df['MntGoldProds'].describe().to_frame()

In [None]:
# Columns counting purchases per channel, and columns holding amount spent per product group.
purchase_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
spend_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']

# Row-wise totals. skipna=False keeps the behaviour of chained `+`:
# a missing value in any component makes the total missing as well.
df['Total_Purchases'] = df[purchase_cols].sum(axis=1, skipna=False)
df['Total_Sales'] = df[spend_cols].sum(axis=1, skipna=False)

In [None]:
# Grand total of the Total_Sales column across all rows.
df['Total_Sales'].sum()

In [None]:
# Share of total purchases contributed by each country.
# Aggregate with the string 'sum' rather than the Python builtin: recent pandas
# versions warn when a builtin callable is passed to agg, and the resulting
# column label ('sum') is the same.
pm = df[['Total_Purchases', 'Country']].groupby(['Country']).agg(['sum'])
purchases_by_country = pm['Total_Purchases']['sum']

# Hand-picked wedge offsets (eight entries). plt.pie needs exactly one offset
# per wedge, so pad with 0 or truncate when the number of countries differs.
explode = (0, 0.5, 0, 0.3, 1, 0, 0, 0.5)
offsets = [explode[i] if i < len(explode) else 0
           for i in range(len(purchases_by_country))]

sns.set_palette('Spectral')
plt.figure(figsize = (7, 7))
plt.pie(purchases_by_country, labels = pm.index, explode = offsets,
        shadow = True, autopct = '%1.1f%%')
plt.title('Share of total purchases by country')
plt.show()

In [None]:
# Share of total amount spent (Total_Sales) contributed by each country.
# Aggregate with the string 'sum' rather than the Python builtin: recent pandas
# versions warn when a builtin callable is passed to agg, and the resulting
# column label ('sum') is the same.
pm = df[['Total_Sales', 'Country']].groupby(['Country']).agg(['sum'])
sales_by_country = pm['Total_Sales']['sum']

# Hand-picked wedge offsets (eight entries). plt.pie needs exactly one offset
# per wedge, so pad with 0 or truncate when the number of countries differs.
explode = (0, 0.5, 0, 0.3, 1, 0, 0, 0.5)
offsets = [explode[i] if i < len(explode) else 0
           for i in range(len(sales_by_country))]

sns.set_palette('Spectral')
plt.figure(figsize = (7, 7))
plt.pie(sales_by_country, labels = pm.index, explode = offsets,
        shadow = True, autopct = '%1.1f%%')
plt.title('Share of total sales by country')
plt.show()

In [None]:
# Simple linear regression: amount spent on gold products (predictor) against
# the number of in-store purchases (response).
# A rising fitted line is consistent with question 3's claim that people who
# spend more on gold make more store purchases; the R^2 printed at the end
# shows how much of the variation the line actually explains.

# Reshape both columns to a single-column 2-D array before fitting.
x = df["MntGoldProds"].values.reshape(-1,1)
y = df["NumStorePurchases"].values.reshape(-1,1)

lm = LinearRegression()
lm.fit(x,y)

# Prediction over an evenly spaced grid spanning the observed x range.
# ndarray.min()/max() return scalars directly; the builtin min()/max() would
# iterate the 2-D array row by row in Python.
x_space = np.linspace(x.min(), x.max()).reshape(-1,1)
predicted_y = lm.predict(x_space)

# Plotting regression line and scatter
plt.figure(figsize=[15,10])
plt.scatter(x = x, y = y)
plt.plot(x_space, predicted_y, color="red", linewidth=3)
plt.xlabel("MntGoldProds")
plt.ylabel("NumStorePurchases")
plt.show()

# R^2 Score
print("R^2 Score: {}".format(lm.score(x,y)))

### Section 03: Data Visualization
Please plot and visualize the answers to the below questions.

#Question 1: Which marketing campaign is most successful?
#Answer 1: Meat seems to have done the best, and it is also highest in the country "SP", which is where the majority of the sales are.

#Question 2: What does the average customer look like for this company?
#Answer2: 

#Question 3: Which products are performing best?
#Answer 3: Meat is doing the best by a landslide. Gold is second best.

#Question 4: Which channels are underperforming?
#Answer 4: By channels I assume you mean where customers bought the merchandise. In-store purchases are really high. The lowest is deals.

In [None]:
# Total amount spent on meat products across all rows.
df["MntMeatProducts"].sum()

In [None]:
# Simple linear regression: amount spent on meat products (predictor) against
# Total_Sales (response). Total_Sales already includes MntMeatProducts as one
# of its components, so part of the relationship exists by construction.

# Reshape both columns to a single-column 2-D array before fitting.
x = df["MntMeatProducts"].values.reshape(-1,1)
y = df["Total_Sales"].values.reshape(-1,1)

lm = LinearRegression()
lm.fit(x,y)

# Prediction over an evenly spaced grid spanning the observed x range.
# ndarray.min()/max() return scalars directly; the builtin min()/max() would
# iterate the 2-D array row by row in Python.
x_space = np.linspace(x.min(), x.max()).reshape(-1,1)
predicted_y = lm.predict(x_space)

# Plotting regression line and scatter
plt.figure(figsize=[15,10])
plt.scatter(x = x, y = y)
plt.plot(x_space, predicted_y, color="red", linewidth=3)
plt.xlabel("MntMeatProducts")
plt.ylabel("Total_Sales")
plt.show()

# R^2 Score
print("R^2 Score: {}".format(lm.score(x,y)))

In [None]:
# Total amount spent on gold products across all rows.
df["MntGoldProds"].sum()

In [None]:
# Simple linear regression: amount spent on gold products (predictor) against
# Total_Sales (response). Total_Sales already includes MntGoldProds as one of
# its components, so part of the relationship exists by construction.

# Reshape both columns to a single-column 2-D array before fitting.
x = df["MntGoldProds"].values.reshape(-1,1)
y = df["Total_Sales"].values.reshape(-1,1)

lm = LinearRegression()
lm.fit(x,y)

# Prediction over an evenly spaced grid spanning the observed x range.
# ndarray.min()/max() return scalars directly; the builtin min()/max() would
# iterate the 2-D array row by row in Python.
x_space = np.linspace(x.min(), x.max()).reshape(-1,1)
predicted_y = lm.predict(x_space)

# Plotting regression line and scatter
plt.figure(figsize=[15,10])
plt.scatter(x = x, y = y)
plt.plot(x_space, predicted_y, color="red", linewidth=3)
plt.xlabel("MntGoldProds")
plt.ylabel("Total_Sales")
plt.show()

# R^2 Score
print("R^2 Score: {}".format(lm.score(x,y)))

#The R^2 score is low (only about 25%), so gold spending on its own explains only about a quarter of the variation in total sales

In [None]:
# Total amount spent on sweet products across all rows.
df['MntSweetProducts'].sum()

In [None]:
# Simple linear regression: amount spent on sweet products (predictor) against
# Total_Sales (response). Total_Sales already includes MntSweetProducts as one
# of its components, so part of the relationship exists by construction.

# Reshape both columns to a single-column 2-D array before fitting.
x = df["MntSweetProducts"].values.reshape(-1,1)
y = df["Total_Sales"].values.reshape(-1,1)

lm = LinearRegression()
lm.fit(x,y)

# Prediction over an evenly spaced grid spanning the observed x range.
# ndarray.min()/max() return scalars directly; the builtin min()/max() would
# iterate the 2-D array row by row in Python.
x_space = np.linspace(x.min(), x.max()).reshape(-1,1)
predicted_y = lm.predict(x_space)

# Plotting regression line and scatter
plt.figure(figsize=[15,10])
plt.scatter(x = x, y = y)
plt.plot(x_space, predicted_y, color="red", linewidth=3)
plt.xlabel("MntSweetProducts")
plt.ylabel("Total_Sales")
plt.show()

# R^2 Score
print("R^2 Score: {}".format(lm.score(x,y)))

In [None]:
# Total amount spent on fish products across all rows.
df["MntFishProducts"].sum()

In [None]:
# Fish spending against Total_Sales with a fitted regression line.
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(data=df, x="MntFishProducts", y="Total_Sales", ax=ax)
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Total amount spent on fruits across all rows.
df["MntFruits"].sum()

In [None]:
# Fruit spending against Total_Sales with a fitted regression line.
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(data=df, x="MntFruits", y="Total_Sales", ax=ax)
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Total amount spent on wines across all rows.
df["MntWines"].sum()

In [None]:
# Wine spending against Total_Sales with a fitted regression line.
fig, ax = plt.subplots(figsize=(10, 10))
sns.regplot(data=df, x="MntWines", y="Total_Sales", ax=ax)
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Gold spending plotted against the number of store purchases, with a fitted line.
ax = sns.regplot(data=df, x='NumStorePurchases', y='MntGoldProds')
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Sweets spending plotted against the number of store purchases, with a fitted line.
ax = sns.regplot(data=df, x='NumStorePurchases', y='MntSweetProducts')
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Fish spending plotted against the number of store purchases, with a fitted line.
ax = sns.regplot(data=df, x='NumStorePurchases', y='MntFishProducts')
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Meat spending plotted against the number of store purchases, with a fitted line.
# Author's observation: meat products sell strongly.
ax = sns.regplot(data=df, x='NumStorePurchases', y='MntMeatProducts')
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Fruit spending plotted against the number of store purchases, with a fitted line.
ax = sns.regplot(data=df, x='NumStorePurchases', y='MntFruits')
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Wine spending plotted against the number of store purchases, with a fitted line.
ax = sns.regplot(data=df, x='NumStorePurchases', y='MntWines')
# Anchor the y-axis at zero; the upper limit stays automatic.
ax.set_ylim(bottom=0)

In [None]:
# Share of total meat spending contributed by each country.
# Aggregate with the string 'sum' rather than the Python builtin: recent pandas
# versions warn when a builtin callable is passed to agg, and the resulting
# column label ('sum') is the same.
pm = df[["MntMeatProducts", 'Country']].groupby(['Country']).agg(['sum'])
meat_by_country = pm["MntMeatProducts"]['sum']

# Hand-picked wedge offsets (eight entries). plt.pie needs exactly one offset
# per wedge, so pad with 0 or truncate when the number of countries differs.
explode = (0, 0.5, 0, 0.3, 1, 0, 0, 0.5)
offsets = [explode[i] if i < len(explode) else 0
           for i in range(len(meat_by_country))]

sns.set_palette('Spectral')
plt.figure(figsize = (7, 7))
plt.pie(meat_by_country, labels = pm.index, explode = offsets,
        shadow = True, autopct = '%1.1f%%')
plt.title('Share of meat spending by country')
plt.show()

In [None]:
# Total number of purchases made through each channel.
# Map each source column to the short label shown on the x-axis, so labels are
# tied to their data instead of being assigned by tick position afterwards.
channel_labels = {
    'NumDealsPurchases': 'Deals',
    'NumWebPurchases': 'Web',
    'NumCatalogPurchases': 'Catalog',
    'NumStorePurchases': 'Store',
}

# The string 'sum' replaces the Python builtin, which recent pandas versions
# warn about when passed to agg; the resulting row/column label is still 'sum'.
prod = df[list(channel_labels)].agg(['sum']).T
channel_totals = prod['sum'].rename(index=channel_labels)

ax = sns.barplot(x = channel_totals.index, y = channel_totals)

ax.set_xlabel('Service')

ax.set_ylabel('Amount')