In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from my_measures import BinaryClassificationPerformance
%matplotlib inline



### read and summarize data

In [4]:
f = '../data/toxiccomments_train.csv'
toxic_data = pd.read_csv(f)

In [5]:
print("toxic_data is:", type(toxic_data))
print("toxic_data has", toxic_data.shape[0], "rows and", toxic_data.shape[1], "columns", "\n")
print("the data types for each of the columns in toxic_data:")
print(toxic_data.dtypes, "\n")
print("the first 10 rows in toxic_data:")
print(toxic_data.head(1))

toxic_data is: <class 'pandas.core.frame.DataFrame'>
toxic_data has 159571 rows and 8 columns 

the data types for each of the columns in toxic_data:
id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object 

the first 10 rows in toxic_data:
                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  


In [6]:
print("The rate of toxic comments in the dataset: ")
print(toxic_data['toxic'].mean())

The rate of toxic comments in the dataset: 
0.09584448302009764


### Feature extraction on natural language data

In [None]:
# # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()
# corpus = toxic_data.comment_text.as_matrix()
# X_bag_of_words = vectorizer.fit_transform(corpus)
# print(X_bag_of_words.toarray())

In [49]:
# vectorize Bag of Words from review text; as sparse matrix
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=2 ** 30, alternate_sign=True)
X_hv = hv.fit_transform(toxic_data.comment_text)
print(X_hv.shape)

(159571, 1073741824)


In [8]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X_hv)
print(X_tfidf.shape)

(159571, 131072)


In [9]:
print(type(X_tfidf))

<class 'scipy.sparse.csr.csr_matrix'>


In [53]:
pip install sister
import sister
embedder = sister.MeanEmbedding(lang="en")

sentence = "I am a dog."
vector = embedder(sentence)  # 300-dim vector

SyntaxError: invalid syntax (<ipython-input-53-f69aa179dfff>, line 1)

### Create additional quantitative features

In [11]:
# features from Amazon.csv to add to feature set
toxic_data['word_count'] = toxic_data['comment_text'].str.split(' ').str.len()
print(toxic_data['word_count']);
toxic_data['punc_count'] = toxic_data['comment_text'].str.count("\.")
#my first own feature
toxic_data['excl_count'] = toxic_data['comment_text'].str.count("!")





X_quant_features = toxic_data[["word_count", "punc_count", "excl_count"]]
print(X_quant_features.head(100))
print(type(X_quant_features))

0          42
1          18
2          42
3         112
4          13
         ... 
159566     49
159567     19
159568     13
159569     25
159570     39
Name: word_count, Length: 159571, dtype: int64
    word_count  punc_count  excl_count
0           42           5           0
1           18           2           1
2           42           3           0
3          112           3           0
4           13           1           0
..         ...         ...         ...
95          84           4           0
96          11           1           0
97          23           0           0
98          31           0           0
99          52           2           0

[100 rows x 3 columns]
<class 'pandas.core.frame.DataFrame'>


### Combine all quantitative features into a single sparse matrix

In [12]:
from scipy.sparse import csr_matrix, hstack
X_quant_features_csr = csr_matrix(X_quant_features)
X_combined = hstack([X_tfidf, X_quant_features_csr])
X_matrix = csr_matrix(X_combined) # convert to sparse matrix
print(X_matrix.shape)

(159571, 131075)


In [13]:
# look at an example of a "row" of a sparse matrix
print(X_matrix[10,203])

0.06089812150584378


### Create `X`, scaled matrix of features

In [14]:
# feature scaling
# mittelwert ist danach null, standard deviation ist 1
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X_matrix)
print(X.shape)

(159571, 131075)


In [15]:
# look at an example of a "row" of a sparse matrix, after scaling
print(X[10,203])

1.4946692372770471


# Create Training and Test Sets

In [16]:
# IMPORTANT, enter an integer into the variable below; any integer other than 74
my_random_state = 700

# create training and test sets
from sklearn.model_selection import train_test_split

# enter an integer for the random_state parameter; any integer will work
X_train, X_test, y_train, y_test, X_raw_train, X_raw_test = train_test_split(X, toxic_data['toxic'], toxic_data, test_size= 0.2, random_state=my_random_state)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(X_raw_train.shape)
print(X_raw_test.shape)

(127656, 131075)
(31915, 131075)
(127656,)
(31915,)
(127656, 11)
(31915, 11)


In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification 



##X_train, y_train = make_classification(n_samples=80000, n_features=1000, n_informative=2, n_redundant=0, random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(X_test, y_test)

RandomForestClassifier(max_depth=2, random_state=0)

                             
##print(clf.feature_importances_)
##print(clf.predict(self, X_test)
print(clf.predict(X_test))

##score(self, X, y[, sample_weight])
##Return the mean accuracy on the given test data and labels.
print(clf.score(X_test,y_test))

##from sklearn.metrics import confusion_matrix
##y_true = X_test
##y_pred = y_test
##confusion_matrix(y_true, y_pred)




[0 0 0 ... 0 0 0]
0.9025536581544729


# Fit Models

### MODEL: ordinary least squares

In [2]:
from sklearn import linear_model
ols = linear_model.SGDClassifier(loss="squared_loss")
ols.fit(X_train, y_train)

ols_performance_train = BinaryClassificationPerformance(ols.predict(X_train), y_train, 'ols_train')
ols_performance_train.compute_measures()
print(ols_performance_train.performance_measures)

NameError: name 'X_train' is not defined

### MODEL: SVM, linear

In [1]:
from sklearn import linear_model
svm = linear_model.SGDClassifier()
svm.fit(X_train, y_train)

svm_performance_train = BinaryClassificationPerformance(svm.predict(X_train), y_train, 'svm_train')
svm_performance_train.compute_measures()
print(svm_performance_train.performance_measures)

NameError: name 'X_train' is not defined

### MODEL: logistic regression

In [24]:
from sklearn import linear_model
lgs = linear_model.SGDClassifier(loss='log', n_iter_no_change=50, alpha=0.00001)
lgs.fit(X_train, y_train)

lgs_performance_train = BinaryClassificationPerformance(lgs.predict(X_train), y_train, 'lgs_train')
lgs_performance_train.compute_measures()
print(lgs_performance_train.performance_measures)

{'Pos': 12184.0, 'Neg': 141161.0, 'TP': 903, 'TN': 83476, 'FP': 8771, 'FN': 8817, 'Accuracy': 0.5502559587857445, 'Precision': 0.09334298118668596, 'Recall': 0.07411359159553513, 'desc': 'lgs_train'}


### MODEL: Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
nbs = MultinomialNB()
nbs.fit(X_train, y_train)

nbs_performance_train = BinaryClassificationPerformance(nbs.predict(X_train), y_train, 'nbs_train')
nbs_performance_train.compute_measures()
print(nbs_performance_train.performance_measures)

### MODEL: Perceptron

In [None]:
from sklearn import linear_model
prc = linear_model.SGDClassifier(loss='perceptron')
prc.fit(X_train, y_train)

prc_performance_train = BinaryClassificationPerformance(prc.predict(X_train), y_train, 'prc_train')
prc_performance_train.compute_measures()
print(prc_performance_train.performance_measures)

### MODEL: Ridge Regression Classifier

In [51]:
from sklearn import linear_model
rdg = linear_model.RidgeClassifier(alpha=5.0)
rdg.fit(X_train, y_train)

rdg_performance_train = BinaryClassificationPerformance(rdg.predict(X_train), y_train, 'rdg_train')
rdg_performance_train.compute_measures()
print(rdg_performance_train.performance_measures)

{'Pos': 12184.0, 'Neg': 141161.0, 'TP': 792, 'TN': 84811, 'FP': 7436, 'FN': 8928, 'Accuracy': 0.5582379601552055, 'Precision': 0.0962566844919786, 'Recall': 0.06500328299409061, 'desc': 'rdg_train'}


### What is the distribution of weights, OLS vs. ridge? 

In [None]:
print(rdg)

In [None]:
plt.hist(ols.coef_[0])
plt.show()

plt.hist(rdg.coef_[0])
plt.show()

### ROC plot to compare performance of various models and fits

In [None]:
fits = [ols_performance_train, svm_performance_train, lgs_performance_train, nbs_performance_train, prc_performance_train, rdg_performance_train]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
plt.axis([0, 1, 0, 1])
plt.title('ROC plot: test set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

## a digression: looking inside the `rdg` object

[documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html)

In [None]:
dir(rdg)

In [None]:
rdg.get_params()

In [None]:
type(rdg.coef_)

In [None]:
rdg.coef_.shape

In [None]:
print(rdg.coef_[0])

In [None]:
plt.hist(rdg.coef_[0])
plt.show()

### looking at reviews based on their classification

Let's say we decide that Ridge Regression is the best model for generalization. Let's take a look at some of the reviews and try to make a (subjective) determination of whether it's generalizing well. 

In [None]:
toxic_data.loc[0, "comment_text"]

In [None]:
ridge_predictions = rdg.predict(X_train)

In [None]:
ridge_predictions.shape

In [None]:
# false positives

print("Examples of false positives:")

import random, time

for i in range(0, len(ridge_predictions)):
    if (ridge_predictions[i] == True):
        if (toxic_data.loc[i, "toxic"] == False):
            if (random.uniform(0, 1) < 0.05):
                print(i)
                print(toxic_data.loc[i, "comment_text"])
                print('* * * * * * * * * ')

---

# <span style="color:red">WARNING: Don't look at test set performance too much!</span>

---

The following cells show performance on your test set. Do not look at this too often! 

# Look at performance on the test set

### MODEL: ordinary least squares

In [None]:
ols_performance_test = BinaryClassificationPerformance(ols.predict(X_test), y_test, 'ols_test')
ols_performance_test.compute_measures()
print(ols_performance_test.performance_measures)

### MODEL: SVM, linear

In [None]:
svm_performance_test = BinaryClassificationPerformance(svm.predict(X_test), y_test, 'svm_test')
svm_performance_test.compute_measures()
print(svm_performance_test.performance_measures)

### MODEL: logistic regression

In [None]:
lgs_performance_test = BinaryClassificationPerformance(lgs.predict(X_test), y_test, 'lgs_test')
lgs_performance_test.compute_measures()
print(lgs_performance_test.performance_measures)

### MODEL: Naive Bayes

In [None]:
nbs_performance_test = BinaryClassificationPerformance(nbs.predict(X_test), y_test, 'nbs_test')
nbs_performance_test.compute_measures()
print(nbs_performance_test.performance_measures)

### MODEL: Perceptron

In [None]:
prc_performance_test = BinaryClassificationPerformance(prc.predict(X_test), y_test, 'prc_test')
prc_performance_test.compute_measures()
print(prc_performance_test.performance_measures)

### MODEL: Ridge Regression Classifier

In [None]:
rdg_performance_test = BinaryClassificationPerformance(rdg.predict(X_test), y_test, 'rdg_test')
rdg_performance_test.compute_measures()
print(rdg_performance_test.performance_measures)

### ROC plot to compare performance of various models and fits

In [None]:
fits = [ols_performance_test, svm_performance_test, lgs_performance_test, nbs_performance_test, prc_performance_test, rdg_performance_test]

for fit in fits:
    plt.plot(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], 'bo')
    plt.text(fit.performance_measures['FP'] / fit.performance_measures['Neg'], 
             fit.performance_measures['TP'] / fit.performance_measures['Pos'], fit.desc)
plt.axis([0, 1, 0, 1])
plt.title('ROC plot: test set')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()