# Negative Pruners
---
## Imports section

In [79]:
%matplotlib inline

import numpy as np
import pandas as pd
import nltk

import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
# from sklearn.cross_validation import train_test_split
# from sklearn import metrics
# from sklearn.cross_validation import cross_val_score
# from sklearn.preprocessing import normalize

# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams['figure.figsize'] = (16, 12)

### Loading data

In [3]:
# Column labels applied to the report, in file order. They replace whatever
# header pandas picked up from the file.
column_names = [
    'match', 'query', 'added', 'campaign', 'group', 'clicks',
    'impressions', 'ctr', 'cpc', 'cost', 'position', 'conversion',
]

# The first and the last line of the TSV are skipped; `skipfooter` is the
# reason the (slower) python parsing engine is requested explicitly.
df = pd.read_csv(
    'dataset.tsv',
    sep='\t',
    skiprows=1,
    skipfooter=1,
    engine='python',
)
df.columns = column_names

# Quick-look helpers, left disabled (uncomment while exploring the data):
# print df.columns
# print df.describe()
# print df.std()
# df.hist()
# plt.show()

In [141]:
# Build the vocabulary used by the models below.
#   terms     - every accepted word seen in any query
#   costIndex - word -> cost of the LAST query in which the word appeared
#               (earlier costs are overwritten, not accumulated)
#   zero      - accepted words that appeared in at least one query with
#               zero conversions
terms = set()
costIndex = {}
zero = set()

# Loop-invariant: load the stop-word list once and keep it in a set, instead of
# rebuilding the list and scanning it linearly for every word of every query.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

for i, query in df['query'].iteritems():
    # Python 2 byte strings: turn escape sequences into unicode characters.
    query = query.decode('unicode-escape')
    # Costs are stored as text with thousands separators (e.g. "1,234.56").
    cost = float(df['cost'][i].replace(',', ''))
    no_conversion = df['conversion'][i] == 0

    for word in query.split():
        # Accept a word only if it is at least 3 characters long, is not an
        # English stop word, and is known to WordNet (has at least one synset).
        if len(word) < 3 or word in english_stopwords:
            continue
        if not nltk.corpus.wordnet.synsets(word):
            continue

        # NOTE(review): words keep their original casing here, while the
        # CountVectorizer built from `terms` uses lowercase=True - confirm
        # that mixed-case vocabulary entries are intended.
        terms.add(word)
        costIndex[word] = cost
        if no_conversion:
            zero.add(word)

### Transforming words into vectors

In [5]:
# Binary bag-of-words encoder restricted to the vocabulary collected in `terms`:
# each column is 1 when the term occurs in the query and 0 otherwise.
vec = CountVectorizer(
    lowercase=True,
    stop_words='english',
    vocabulary=terms,
    binary=True,
)

# Encode every query, then densify so the estimators below get a plain array.
query_term_matrix = vec.fit_transform(df['query'])
data = query_term_matrix.toarray()

# Sanity check: encode a single one-word query and show its row.
sample_row = vec.transform(['list']).toarray()
print (sample_row)

[[0 0 0 ..., 0 0 0]]


In [92]:
y = df['conversion'].copy()
y[y > 0] = 1

model = LogisticRegression()
model = model.fit(data, y)

print 'score: ', model.score(data, y)
print 'prediction :', model.predict(vec.transform(["themes branded alternatives cloud fast discussion combining"]))

conversion_weights = zip(model.coef_[0], vec.get_feature_names())
conversion_weights.sort()

abs_minimum_conversion = abs(conversion_weights[0][0])
normalization_dividend_conversion = abs_minimum_conversion + conversion_weights[-1][0]
conversion_weights = [(((n + abs_minimum_conversion) / normalization_dividend_conversion), k) for n, k in conversion_weights]
print conversion_weights

score:  0.973255813953
prediction : [1]
[(0.0, u'blog'), (0.019471933330608697, u'page'), (0.02020876839255881, u'new'), (0.047402890736152087, u'scratch'), (0.069444260145286443, u'kids'), (0.11079778245711247, u'custom'), (0.11382529888354606, u'music'), (0.12284625376440603, u'way'), (0.12788646085185565, u'download'), (0.13198209809471903, u'develop'), (0.13722555953838356, u'design'), (0.13781631436973996, u'html'), (0.15453303602303747, u'code'), (0.16433190215116955, u'people'), (0.16568948050691959, u'simple'), (0.16741594978728469, u'www'), (0.1739646396648411, u'money'), (0.18911747846913673, u'cost'), (0.1928798930456315, u'good'), (0.20315856477606639, u'members'), (0.20657991864105524, u'pages'), (0.20744518130225365, u'steps'), (0.21015048928198707, u'help'), (0.2152647668568933, u'business'), (0.22000724887299358, u'groups'), (0.22293732814946413, u'tools'), (0.22298604082782983, u'club'), (0.22524101171815089, u'game'), (0.23296051827187386, u'account'), (0.247222187331

### CPC linear regressor

In [8]:
# Regress the per-query CPC on the binary term matrix; the fitted coefficients
# (one per vocabulary term) are what the next cell reads.
cpc = df['cpc'].copy()
costModel = LinearRegression().fit(data, cpc)

In [99]:
costs = zip(costModel.coef_, vec.get_feature_names())
costs = [(n, k) for n, k in costs if n > -100 and n < 100]
costs.sort()
# print (cpc.describe())
# print costModel.predict(vec.transform(['htto www pkr en community forums pkr lounge mini masters crash official statement']).toarray())

abs_minimum_costs = abs(costs[0][0])
normalization_dividend_costs = abs_minimum_costs + costs[-1][0]
costs_weights = [(((n + abs_minimum_costs) / normalization_dividend_costs), k) for n, k in costs]

# we need to reverse cost because higher cost is bad for us and lower cost is better (reverse if conversion weights)
costs_weights_reversed = [(1-c[0], c[1]) for c in costs_weights]
costs_weights_reversed.sort()

print costs_weights_reversed

[(0.0, u'spent'), (0.27161226299914409, u'earning'), (0.32133976888535654, u'stuck'), (0.33288117445904475, u'organizer'), (0.33568313607776934, u'organisation'), (0.34321953082473677, u'negative'), (0.35662295792077336, u'breathing'), (0.36501923596556884, u'synagogue'), (0.3691375707759138, u'gardeners'), (0.37999133268880558, u'photographer'), (0.39142699174249651, u'tampa'), (0.39451371363078946, u'animated'), (0.39517442064064923, u'athletic'), (0.39563801299821866, u'sick'), (0.3980850313959382, u'grade'), (0.3988956811303328, u'renewal'), (0.40704842733626323, u'geography'), (0.42921356878757966, u'unity'), (0.4311698342458854, u'wbs'), (0.43325441040483648, u'poetry'), (0.44025539780595091, u'auction'), (0.44866355593129492, u'scenarios'), (0.4497752126611424, u'des'), (0.4501986525869599, u'harrow'), (0.45027921842358842, u'positive'), (0.45350970743846264, u'activity'), (0.45432336597654877, u'sustaining'), (0.45560968200261243, u'past'), (0.45700789982503121, u'optimize'), (

### Combining scores

In [153]:
conversions_dict = dict([(k, n) for n, k in conversion_weights])
costs_dict = dict([(k, n) for n, k in costs_weights_reversed])

combined = {}
for k in conversions_dict.keys():
    if k in costs_dict:
#         combined[k] = 2 * conversions_dict[k] * costs_dict[k] / (conversions_dict[k] + costs_dict[k])
        combined[k] = (1 * conversions_dict[k] + 1 * costs_dict[k]) / 2

combined_tuples = sorted(combined.items(), key= lambda x : x[1])

negatives = [w for w,_ in combined_tuples[:1000]]

cut_costs = 0
total_negatives_cost = 0
for i, q in df['query'].iteritems():
    if any(word in negatives for word in q.split()) and df['conversion'][i] == 0:
        cut_costs += float(df['cost'][i])
    total_negatives_cost += float(df['cost'][i].replace(',', ''))

print cut_costs/total_negatives_cost

0.466695472811




### Clicks linear regressor

In [41]:
# Regress the per-query click count on the binary term matrix; the fitted
# coefficients (one per vocabulary term) are what the next cell reads.
clicks = df['clicks'].copy()
clicksModel = LinearRegression().fit(data, clicks)

In [85]:
clicks_weights = zip(clicksModel.coef_, vec.get_feature_names())
clicks_weights = [c for c in clicks_weights if c[0] < 100 and c[0] > -100]
clicks_weights.sort()

abs_minimum = abs(clicks_weights[0][0])
normalization_dividend = abs_minimum + clicks_weights[-1][0]
clicks_weights = [(((n + abs_minimum) / normalization_dividend), k) for n, k in clicks_weights]

print clicks_weights

[(0.0, u'competitive'), (0.0047078908922703541, u'visual'), (0.0064593666116941478, u'sleek'), (0.018181376489500654, u'nice'), (0.041273764292177259, u'resource'), (0.059143761113491851, u'edge'), (0.067891640261990641, u'rank'), (0.068572548654095414, u'fronter'), (0.073293525216608896, u'pakistan'), (0.073966438605519894, u'machinist'), (0.074526951586073226, u'supplier'), (0.075044432255903376, u'conferences'), (0.076300578779336242, u'clannish'), (0.077526469723616515, u'serving'), (0.078215059066804787, u'enterprises'), (0.079068986487907889, u'investor'), (0.079344420258752435, u'validation'), (0.079436965048384628, u'peon'), (0.079470230757393789, u'bracelets'), (0.079539999915164139, u'garage'), (0.07966169954531295, u'needs'), (0.079974782373133518, u'beats'), (0.080152441046091163, u'integrating'), (0.080163106074589707, u'compared'), (0.08066753133938126, u'adverts'), (0.080889782030487842, u'question'), (0.081319466078870625, u'hassle'), (0.08142474975180089, u'original'),