In [None]:
# Import the required libraries
import pandas as pd
import math, datetime
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib import style
import pickle
import re
from sklearn.cluster import KMeans

# Apply matplotlib's built-in 'ggplot' style sheet to all figures drawn below
style.use('ggplot')
%matplotlib inline

In [None]:
# Data Source: GSC exported via https://searchanalyticsforsheets.com/

In [None]:
# Load the exported report into a DataFrame.
# thousands='.' makes pandas read '.' inside numbers as a thousands separator
# (e.g. "1.234" -> 1234); values with a decimal comma stay text and are
# converted in a later cell.
# NOTE(review): presumably the export uses German number formatting - confirm.
df = pd.read_csv("data/example_crt_prediction.csv", sep=",", thousands='.')

In [None]:
# Preview the first 10 rows of the raw (not yet converted) data
df.head(10)

In [None]:
# Convert the text columns CTR and Position to floats.
# CTR: drop the percent sign, then turn the decimal comma into a decimal point.
# Position: turn the decimal comma into a decimal point.
# Each conversion is skipped when the column is already numeric, so the cell
# can be re-run (and also works if read_csv already parsed the column as a
# number) instead of failing on the `.str` accessor.
if not pd.api.types.is_numeric_dtype(df["CTR"]):
    df["CTR"] = (
        df["CTR"]
        .str.replace("%", "", regex=False)
        .str.replace(",", ".", regex=False)
        .astype(float)
    )
if not pd.api.types.is_numeric_dtype(df["Position"]):
    df["Position"] = df["Position"].str.replace(",", ".", regex=False).astype(float)

# Round every numeric column to whole numbers.
# NOTE(review): this discards the fractional part of CTR and Position;
# presumably done so the classifier used further down gets discrete CTR labels.
df = df.round(0)

In [None]:
# Keep only the rows in which no column is missing a value
df = df.dropna()

In [None]:
# Preview the first rows again, now with numeric columns and without empty rows
df.head()

In [None]:
# Show how strongly each numeric column correlates with CTR.
# Text columns are excluded first: since pandas 2.0 `DataFrame.corr()` no longer
# drops them silently and raises an error when it meets non-numeric data.
df.select_dtypes(include="number").corr()["CTR"]

In [None]:
# Define features and target
# features: the columns the algorithms use as input for their predictions
features = ["Position", "Impressions"]  # change this list to see how the predictions change
# target: the column whose value the models should predict
target = "CTR"

In [None]:
# Split the data into a training set (80 % of the rows, drawn at random)
# and a test set (all remaining rows).
# The fixed random_state makes the split - and with it every score computed
# below - reproducible across kernel restarts.
train = df.sample(frac=0.8, random_state=42)
test = df.loc[~df.index.isin(train.index)]

In [None]:
# Report how many rows ended up in each split
print("Train rows: {}".format(train.shape[0]))
print("Test rows: {}".format(test.shape[0]))

In [None]:
# Import different algorithms to compare their predictions
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Helper Function to print readable scores
def print_scores(scores):
    """Print each score on its own line, prefixed with a run number.

    Args:
        scores: Iterable of score values. Runs are numbered from 1 in
            iteration order.

    Returns:
        None. The lines are written to standard output.
    """
    for run_number, value in enumerate(scores, start=1):
        print("Run: {} - Score: {}".format(run_number, value))

In [None]:
# Model 1
# Define which model to use: a linear regression with default settings
LinearRegressionModel = LinearRegression()

In [None]:
# Train the model: fit it on the feature columns and the target column of the training split
LinearRegressionModel.fit(train[features], train[target])

In [None]:
# Test how the model performs on the test split created above
# (rows that were not used for fitting).
# For LinearRegression, `score` is the coefficient of determination R^2.
prediction_score = LinearRegressionModel.score(test[features], test[target])
print("The score of prediction for LinearRegressionModel is: {}".format(prediction_score))

In [None]:
# Model 2 (same steps as model 1, but in a single cell)
# NOTE(review): this is a classifier, so every distinct CTR value is treated as
# a class label and `score` reports classification accuracy - not R^2 as for
# the regressors. The printed scores are therefore not directly comparable;
# consider DecisionTreeRegressor if a like-for-like comparison is wanted.
# random_state pins the tree's internal randomness so re-runs build the same tree.
DecisionTreeClassifierModel = DecisionTreeClassifier(random_state=42)
DecisionTreeClassifierModel.fit(train[features], train[target])
prediction_score = DecisionTreeClassifierModel.score(test[features], test[target])
print("The score of prediction for DecisionTreeClassifierModel is: {}".format(prediction_score))

In [None]:
# Model 3 / tuning hyperparameters
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Pipeline: standardise the features, then fit a random forest of 200 trees.
# random_state makes the forest (and so the tuned score) reproducible.
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=200, random_state=42))

# Declare hyperparameters to tune.
# The key prefix is the lower-cased class name that make_pipeline assigns to the step.
# 1.0 (= consider all features at each split) replaces the former 'auto', which
# meant the same for RandomForestRegressor but was removed in scikit-learn 1.3
# and makes the grid search fail there.
hyperparameters = {'randomforestregressor__max_features': [1.0, 'sqrt', 'log2'],
                   'randomforestregressor__max_depth': [5, 3]}

# Tune the model: 5-fold cross-validated grid search over all combinations above
RandomForestRegressorModel = GridSearchCV(pipeline, hyperparameters, cv=5)

RandomForestRegressorModel.fit(train[features], train[target])
# Score the best parameter combination found on the held-out test split
prediction_score = RandomForestRegressorModel.score(test[features], test[target])
print("The score of prediction for RandomForestRegressorModel is: {}".format(prediction_score))

In [None]:
# Print the prediction of every model created above for one example keyword

# Define the input for the predictions
# (in this case: which CTR to expect for a keyword on position 2 with 200 impressions)
position = 2.0
impressions = 200
data = [[position, impressions]]  # one row; needs as many values as there are features

df_to_predict = pd.DataFrame(data=data, index=[0], columns=features)

# Each entry pairs the output template with the model that fills it in.
# int() cuts off the decimals of the predicted value (no rounding).
for message, fitted_model in (
    ("LinearRegressionModel predicted:       {}% CTR", LinearRegressionModel),
    ("DecisionTreeClassifierModel predicted: {}% CTR", DecisionTreeClassifierModel),
    ("RandomForestRegressorModel predicted:  {}% CTR", RandomForestRegressorModel),
):
    res = fitted_model.predict(df_to_predict)
    print(message.format(int(res[0])))

In [None]:
# Helper Function to plot Models
import matplotlib.pyplot as plt

def plt_ctr_from_to_position(models, features, from_pos, to_pos, data,
                             position_feature="Position", ax=None):
    """Plot the value each model predicts over a range of ranking positions.

    The single example row in ``data`` is repeated once per position and its
    ``position_feature`` value is replaced by that position, so only the
    position varies along the x axis while all other features stay fixed.
    (Previously the loop position was never written into the prediction
    input, so every model was asked about the same row and the plot was a
    flat line.)

    Args:
        models: Iterable of fitted models exposing ``predict(DataFrame)``.
        features: Column names, in the order of the values in ``data``.
        from_pos: First position to plot (inclusive).
        to_pos: End of the position range (exclusive, as in ``range``).
        data: List holding one example row, e.g. ``[[position, impressions]]``.
        position_feature: Name of the feature that is varied. Must be one of
            ``features``.
        ax: Object with the matplotlib Axes plotting methods to draw on.
            Defaults to the current pyplot axes.

    Returns:
        None. One line per model is drawn onto the axes.

    Raises:
        ValueError: If ``position_feature`` is not contained in ``features``.
    """
    if position_feature not in features:
        raise ValueError(
            "position_feature {!r} is not one of the features {!r}".format(
                position_feature, list(features)))

    models = list(models)
    positions = list(range(from_pos, to_pos))
    if not models or not positions:
        return  # nothing to predict or draw

    # One input row per position: copy the example row, then overwrite the
    # position column so that it actually changes from row to row.
    example_row = list(data[0])
    grid = pd.DataFrame(data=[list(example_row) for _ in positions], columns=features)
    grid[position_feature] = positions

    if ax is None:
        ax = plt.gca()
    for model in models:
        # A single batched predict call per model covers all positions
        ax.plot(positions, list(model.predict(grid)), label=type(model).__name__)
    ax.set_xlabel(position_feature)
    ax.set_ylabel("Predicted CTR")
    ax.legend()

In [None]:
# Draw the predicted CTR of all three models over positions 1 to 19 (the end of the range is exclusive)
plt_ctr_from_to_position([LinearRegressionModel, DecisionTreeClassifierModel, RandomForestRegressorModel], features, 1, 20, data)