# Import Libraries

In [None]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Read in Data

In [None]:
# List the files available in the project's input directory
print(os.listdir("../input/productmarketfit/"))

In [None]:
# Training data (described by the author as the 10% split)
product_source_train = pd.read_csv('../input/productmarketfit/product_source_train.csv')
print('Training data shape: ', product_source_train.shape)
# Preview the first 10 rows
product_source_train.head(10)

In [None]:
# Testing data features (described by the author as the 90% split)
product_source_test = pd.read_csv('../input/productmarketfit/product_source_test.csv')
print('Testing data shape: ', product_source_test.shape)
# Preview the first 10 rows
product_source_test.head(10)

In [None]:
# Validation data; its 'Target' column is used later as ground truth for the ROC curve
product_source_validate = pd.read_csv('../input/productmarketfit/product_source_validate.csv')
print('validating data shape: ', product_source_validate.shape)
# Preview the first rows
product_source_validate.head()

# Exploratory Data Analysis

# **Distribution of the Target Column**

In [None]:
# Number of training rows for each distinct value of the target
product_source_train['Target'].value_counts()

In [None]:
# Histogram of the target cast to int, to show the class balance visually
product_source_train['Target'].astype(int).plot.hist();

# **Examine Missing Values**

In [None]:
def missing_values_table(df):
    """Summarise the missing values in each column of a dataframe.

    Prints the total number of columns in `df` and how many of them contain
    missing values.

    Args:
        df (dataframe): the data to inspect.

    Returns:
        dataframe: one row per column of `df` that has at least one missing
        value, with the columns 'Missing Values' (count) and
        '% of Total Values' (share of rows, rounded to one decimal), sorted
        by percentage in descending order.
    """
    # Count the nulls once and reuse the result for the percentage
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Make a table with the results
    mis_val_table = pd.DataFrame(
        {'Missing Values': mis_val, '% of Total Values': mis_val_percent})

    # Keep only the columns that really have missing values. Filtering on the
    # count rather than the percentage stops an empty dataframe (whose
    # percentages are 0/0 = NaN) from reporting every column as missing.
    mis_val_table_ren_columns = mis_val_table[
        mis_val_table['Missing Values'] > 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [None]:
# Missing values statistics for the training data
missing_values = missing_values_table(product_source_train)
# Show the 20 columns with the highest share of missing values
missing_values.head(20)

# **Review Column Type**

In [None]:
# Number of columns of each dtype in the training data
product_source_train.dtypes.value_counts()

In [None]:
# Number of unique classes in each object (text) column;
# this decides which columns are label encoded and which are one-hot encoded
product_source_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

# **Encode Categorical Variables**


# Label Encoding

In [None]:
# Label-encode every text column that has at most two distinct values
le = LabelEncoder()
le_count = 0

for col in product_source_train:
    # Only text (object) columns are candidates
    if product_source_train[col].dtype != 'object':
        continue
    # Columns with more than 2 categories are left for one-hot encoding
    if len(product_source_train[col].unique()) > 2:
        continue

    # Learn the mapping from the training data only ...
    le.fit(product_source_train[col])
    # ... and apply that same mapping to both training and testing data
    product_source_train[col] = le.transform(product_source_train[col])
    product_source_test[col] = le.transform(product_source_test[col])

    # Keep track of how many columns were label encoded
    le_count += 1

print('%d columns were label encoded.' % le_count)

# One-Hot Encoding

In [None]:
# One-hot encoding of the remaining categorical variables.
# Train and test are encoded independently, so their columns can differ
# until they are aligned afterwards.
product_source_train = pd.get_dummies(product_source_train)
product_source_test = pd.get_dummies(product_source_test)

print('Training Features shape: ', product_source_train.shape)
print('Testing Features shape: ', product_source_test.shape)

# **Align Training and Testing Data**

In [None]:
# Save the labels first: the inner join below removes 'Target' from the
# training data whenever the testing data has no such column
train_labels = product_source_train['Target']

# Align the training and testing data, keep only columns present in both dataframes
product_source_train, product_source_test = product_source_train.align(product_source_test, join = 'inner', axis = 1)

# Add the target back in
product_source_train['Target'] = train_labels

print('Training Features shape: ', product_source_train.shape)
print('Testing Features shape: ', product_source_test.shape)

# **Pearson Correlation Coefficient**

In [None]:
# Find correlations of every column with the target and sort ascending,
# so the most negative come first and the most positive last
correlations = product_source_train.corr()['Target'].sort_values()

# Display the 42 strongest correlations at each end of the sorted list
print('Most Positive Correlations:\n', correlations.tail(42))
print('\nMost Negative Correlations:\n', correlations.head(42))

# Effect of Product's Unit Price on Product-market Fit

In [None]:
# Summary statistics of the unit price in the training data
(product_source_train['Unit_Price']).describe()

In [None]:
# Histogram of unit prices using bin edges 0, 25, ..., 475
product_source_train['Unit_Price'].plot.hist(title = 'Unit Price Histogram',edgecolor = 'k',bins = np.arange(0,500,25));
plt.xlabel('Product Unit Prices (Bin=25)');

In [None]:
# Compare the unit price distribution of the two target classes
plt.figure(figsize = (10, 8))

# KDE plot of product choices that were not selected
sns.kdeplot(product_source_train.loc[product_source_train['Target'] == 0, 'Unit_Price'], label ='target == 0')

# KDE plot of product choices which were selected
sns.kdeplot(product_source_train.loc[product_source_train['Target'] == 1, 'Unit_Price'], label ='target == 1')

# Labeling of plot
plt.xlabel('Unit_Price'); plt.ylabel('Density'); plt.title('Distribution of Unit Prices');

# Draw the legend explicitly: the `label` arguments alone do not guarantee
# one, and without it the two curves cannot be told apart
plt.legend();

In [None]:
# Price information into a separate dataframe. Take an explicit copy so the
# new columns are added to an independent frame rather than to a slice of
# the training data (avoids pandas' SettingWithCopy ambiguity).
price_data = product_source_train[['Target', 'Unit_Price']].copy()
price_data['Unit_Cost'] = price_data['Unit_Price']

# Bin the price data into ten equal-width bins with edges 0, 100, ..., 1000
price_data['Prices_Binned'] = pd.cut(price_data['Unit_Cost'], bins = np.linspace(0, 1000, num = 11))
price_data.head(10)

In [None]:
# Group by the price bin and calculate the average of every column;
# the mean of 'Target' is the share of selected products in each bin
price_groups  = price_data.groupby('Prices_Binned').mean()
price_groups

In [None]:
plt.figure(figsize = (8, 8))

# Graph the price bins and the average of the target as a bar plot
# (the mean is multiplied by 100 to show it as a percentage)
plt.bar(price_groups.index.astype(str), 100 * price_groups['Target'])

# Plot labeling; tick labels are rotated so the interval strings do not overlap
plt.xticks(rotation = 75); plt.xlabel('Price Group (USD)'); plt.ylabel('Product Market Fit (%)')
plt.title('Product Market Fit by Price Group');

In [None]:
# Box plot of unit price per target class ...
ax = sns.boxplot(x='Target', y='Unit_Price', 
                data=product_source_train)
# ... with the individual observations drawn on top as jittered points
ax = sns.stripplot(x="Target", y='Unit_Price',
                   data=product_source_train, jitter=True,
                   edgecolor="gray")

# Effect of Supplier's Region on Product-market Fit

In [None]:
# Find the correlation between the absolute unit price and the target.
# Note: this overwrites 'Unit_Price' in the training data with its absolute value.
product_source_train['Unit_Price'] = abs(product_source_train['Unit_Price'])
product_source_train['Unit_Price'].corr(product_source_train['Target'])

In [None]:
# Find the correlation between the (absolute) Supplier_Region_United States
# indicator and the target; the column is overwritten with its absolute value
product_source_train['Supplier_Region_United States'] = abs(product_source_train['Supplier_Region_United States'])
product_source_train['Supplier_Region_United States'].corr(product_source_train['Target'])

In [None]:
# Histogram of the Supplier_Region_United States indicator.
# The title now names the plotted column (it previously said 'Unit Price').
product_source_train['Supplier_Region_United States'].plot.hist(title = 'Supplier_Region_United States Histogram');
plt.xlabel('Supplier_Region_United States');

In [None]:
# Compare the Supplier_Region_United States distribution of the two target classes
plt.figure(figsize = (10, 8))

# KDE plot of product choices that were not selected
sns.kdeplot(product_source_train.loc[product_source_train['Target'] == 0, 'Supplier_Region_United States'],label = 'target == 0')

# KDE plot of product choices which were selected
sns.kdeplot(product_source_train.loc[product_source_train['Target'] == 1, 'Supplier_Region_United States'],label = 'target == 1')

# Labeling of plot: the axis label and title now describe the plotted column
# (they previously referred to unit prices)
plt.xlabel('Supplier_Region_United States'); plt.ylabel('Density'); plt.title('Distribution of Supplier_Region_United States');

# Draw the legend explicitly so the two curves can be told apart
plt.legend();

# Top 4 Most Positively Correlated Variables

In [None]:
# Select the target plus the four most positively correlated variables,
# then compute their pairwise correlation matrix
ext_data = product_source_train[[
    'Target',
    'Comments_Frequency',
    'Updates_Frequency',
    'Recommend_By_Editer',
    'Supplier_Region_United States',
]]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
plt.figure(figsize = (8, 6))

# Heatmap of correlations, annotated with the values; the colour scale is
# clipped to the range [-0.25, 0.6]
sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin = -0.25, annot = True, vmax = 0.6)
plt.title('Correlation Heatmap');

# Top 10 Most Positively Correlated Regions

In [None]:
# Select the target plus the ten most positively correlated supplier-region
# indicators, then compute their pairwise correlation matrix
region_data = product_source_train[[
    'Target',
    'Supplier_Region_United States',
    'Supplier_Region_China',
    'Supplier_Region_Israel',
    'Supplier_Region_Puerto Rico',
    'Supplier_Region_Denmark',
    'Supplier_Region_Slovakia',
    'Supplier_Region_Australia',
    'Supplier_Region_Switzerland',
    'Supplier_Region_Finland',
    'Supplier_Region_Colombia',
]]
region_data_corrs = region_data.corr()
region_data_corrs

# Top 9 Most Negatively Correlated Regions

In [None]:
# Select the target plus the most negatively correlated supplier-region
# indicators (nine are listed), then compute their pairwise correlation matrix
region_data = product_source_train[[
    'Target',
    'Supplier_Region_Singapore',
    'Supplier_Region_Italy',
    'Supplier_Region_Korea, Republic of',
    'Supplier_Region_Hong Kong',
    'Supplier_Region_Austria',
    'Supplier_Region_Ukraine',
    'Supplier_Region_France',
    'Supplier_Region_Germany',
    'Supplier_Region_Portugal',
]]
region_data_corrs = region_data.corr()
region_data_corrs

# Product Category Analysis and Comparison

In [None]:
# Select the target plus the product-category indicators, then compute their
# pairwise correlation matrix (the `region_data` names are reused here even
# though the columns are product categories)
region_data = product_source_train[[
    'Target',
    'Product_Category_Phones & Accessories',
    'Product_Category_Home',
    'Product_Category_Productivity',
    'Product_Category_Audio',
    'Product_Category_Camera Gear',
    'Product_Category_Travel & Outdoors',
    'Product_Category_Energy & Green Tech',
    'Product_Category_Transportation',
    'Product_Category_Food & Beverages',
    'Product_Category_Health & Fitness',
    'Product_Category_Fashion & Wearables',
    'Product_Category_Education',
]]
region_data_corrs = region_data.corr()
region_data_corrs

# Baseline: Logistic Regression Implementation

In [None]:

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Mean imputation of missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Drop the target from the training data
if 'Target' in product_source_train:
    train = product_source_train.drop(columns = ['Target'])
else:
    train = product_source_train.copy()

# Feature names
features = list(train.columns)

# Feature copy of the testing data. If the test file also carries 'Target',
# drop it so the test matrix has exactly the columns the imputer and scaler
# are fitted on (errors='ignore' makes this a no-op when it is absent).
test = product_source_test.drop(columns = ['Target'], errors = 'ignore')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data only, so no test statistics leak into the imputer
imputer.fit(train)

# Transform both training and testing data
train = imputer.transform(train)
test = imputer.transform(test)

# Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Make the model with the specified regularization parameter
# (in scikit-learn C is the inverse of regularization strength, so this small
# value regularizes heavily)
log_reg = LogisticRegression(C = 0.0001)

# Train on the imputed and scaled training data
log_reg.fit(train, train_labels)

In [None]:
# Make predictions on the test features
# predict_proba returns one column per class; select the second column only
# (index 1), the probability of the second class
log_reg_pred = log_reg.predict_proba(test)[:, 1]

In [None]:
# Submission dataframe: the test IDs with the predicted probability.
# Copy the ID column so 'Target' is added to an independent frame rather than
# to a slice of the test data (avoids pandas' SettingWithCopy ambiguity).
submit = product_source_test[['ID']].copy()
submit['Target'] = log_reg_pred

submit.head()

In [None]:
# Save the logistic regression predictions without the dataframe index
submit.to_csv('log_reg_baseline.csv', index = False)

# Improved Model: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier: 100 trees, a fixed seed for
# reproducibility, progress output enabled, and all CPU cores used
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)

In [None]:
# Train on the training data
random_forest.fit(train, train_labels)

# Extract feature importances and pair them with the feature names
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Make predictions on the test data (second probability column, index 1)
predictions = random_forest.predict_proba(test)[:, 1]

In [None]:
# Make a submission dataframe: the test IDs with the predicted probability.
# Copy the ID column so 'Target' is added to an independent frame rather than
# to a slice of the test data (avoids pandas' SettingWithCopy ambiguity).
submit = product_source_test[['ID']].copy()
submit['Target'] = predictions

# Save the submission dataframe
submit.to_csv('random_forest_baseline.csv', index = False)

# Model Interpretation: Feature Importances

In [None]:
def plot_feature_importances(df, top_n = 15):
    """
    Plot importances returned by a model. This can work with any measure of
    feature importance provided that higher importance is better.

    Args:
        df (dataframe): feature importances. Must have the features in a column
        called `feature` and the importances in a column called `importance`

        top_n (int): how many of the most important features to plot.
        Defaults to 15; if fewer features exist, all of them are plotted.

    Returns:
        shows a plot of the `top_n` most important features

        df (dataframe): feature importances sorted by importance (highest to lowest)
        with a column for normalized importance. The pre-sort index is kept
        in an `index` column.
    """

    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()

    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Rows to plot, and their bar positions in reverse order so that the
    # most important feature ends up at the top of the horizontal chart
    top = df.head(top_n)
    positions = list(reversed(list(top.index)))

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(positions,
            top['importance_normalized'],
            align = 'center', edgecolor = 'k')

    # Set the yticks and labels
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])

    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()

    return df

In [None]:
# Show the random forest feature importances for the default features and
# keep the sorted, normalized table
feature_importances_sorted = plot_feature_importances(feature_importances)

# Validation Metric: ROC AUC

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve of the random forest probabilities against the validation labels.
# NOTE(review): `predictions` was computed from the test features
# (product_source_test) while the labels come from product_source_validate -
# confirm both files hold the same rows in the same order, otherwise the
# metric is meaningless.
false_positive_rate, true_positive_rate, thresholds = roc_curve(product_source_validate['Target'], predictions)
# Area under the ROC curve
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
import matplotlib.pyplot as plt
# Plot the ROC curve with the AUC shown in the legend
plt.figure(figsize=(10,10))
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate,true_positive_rate, color='red',label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Dashed diagonal: the reference line of a random classifier
plt.plot([0, 1], [0, 1],linestyle='--')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')