In [1]:
from gensim.models import Doc2Vec
from sklearn import decomposition
from sklearn.linear_model import LogisticRegression
import numpy as np  
import sys
from sklearn import metrics

import pandas as pd
import random as rnd
from random import shuffle

sys.path.append('../')
from twitch import twitch_commons

%pylab inline
matplotlib.use('Agg')
import matplotlib as mpl
import matplotlib.pyplot as plt

fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 12
fig_size[1] = 9
plt.rcParams["figure.figsize"] = fig_size

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [2]:
# Collect the text before the first comma of every line in the samples file;
# these values are used as document-vector ids by the following cells.
# The `with` block guarantees the file handle is closed once it has been read.
with open('./channel_random_message_samples.csv.dat', 'r') as samples_file:
    docvec_ids = [line.split(',')[0] for line in samples_file]
print(len(docvec_ids))

57638


In [3]:
def _read_channel_names(path):
    """Return the whitespace-stripped lines of ``path`` as a list, closing the file."""
    with open(path, 'r') as channel_file:
        return [line.strip() for line in channel_file]


def _first_positions(names):
    """Map each name to the index of its first occurrence (same value as ``list.index``)."""
    positions = {}
    for position, name in enumerate(names):
        positions.setdefault(name, position)
    return positions


# Number of consecutive ranks that share one `quartile` bucket.
CHANNELS_PER_QUARTILE = 50

female_channels = _read_channel_names('../female_channels.csv')
male_channels = _read_channel_names('../male_channels.csv')

# Dict lookups replace repeated `in` / `.index()` scans over the lists, which
# were linear in the list length for every single document id.
female_rank_by_channel = _first_positions(female_channels)
male_rank_by_channel = _first_positions(male_channels)

df_list = []
for docvec_id in docvec_ids:
    # An id has the form "<channel>_<suffix>". Splitting on the LAST underscore
    # keeps channel names that contain underscores intact, however many they
    # contain. The previous code handled only 2 or 3 parts and otherwise
    # reused the channel of the previous iteration (or hit a NameError).
    channel = docvec_id.rsplit('_', 1)[0]

    if channel in female_rank_by_channel:
        channel_type = 1
        channel_rank = female_rank_by_channel[channel]
    elif channel in male_rank_by_channel:
        channel_type = 0
        channel_rank = male_rank_by_channel[channel]
    else:
        # Same exception type that `male_channels.index(channel)` used to raise,
        # but with a message that identifies the offending id.
        raise ValueError(
            'channel %r (from document id %r) is in neither channel list'
            % (channel, docvec_id)
        )

    # Buckets are numbered from 1: ranks 0-49 -> 1, ranks 50-99 -> 2, ...
    quartile = channel_rank // CHANNELS_PER_QUARTILE + 1

    df_list.append((docvec_id, channel, channel_type, channel_rank, quartile))

print(len(df_list))

57638


In [4]:
# Fixed seed so the row order, and therefore the train/test split built from
# it further down, is the same on every run.
SHUFFLE_SEED = 42

# Shuffle a copy instead of `df_list` itself: re-running this cell then always
# starts from the same input order and yields the same permutation.
# NOTE(review): `df_list` is no longer reordered in place - confirm nothing
# outside this cell relied on that side effect.
shuffled_rows = list(df_list)
rnd.Random(SHUFFLE_SEED).shuffle(shuffled_rows)

channel_df = pd.DataFrame(
    shuffled_rows,
    columns=['docvec_index', 'channel', 'gender', 'channel_rank', 'quartile'],
)

# Plain Python lists, all in the DataFrame's (shuffled) row order.
gender_list = channel_df['gender'].tolist()
quartile_list = channel_df['quartile'].tolist()
rank_list = channel_df['channel_rank'].tolist()

In [5]:
# Restore the previously saved Doc2Vec model from disk.
model = Doc2Vec.load('./channel_chats.d2v')

# One document vector per DataFrame row, fetched in row order so that
# feature_vectors[i] belongs to the same row as channel_df.iloc[i].
# NOTE(review): `docvecs` is the older gensim accessor name; newer gensim
# releases expose the same vectors as `model.dv` - confirm the pinned version.
docvec_keys = channel_df['docvec_index'].values.tolist()
feature_vectors = [model.docvecs[key] for key in docvec_keys]

In [6]:
# Number of feature vectors collected above. Despite the name this counts
# entries of `feature_vectors` (one per document id), not distinct channels.
no_of_channels = len(feature_vectors)
print(no_of_channels)

57638


### Classification model: predicting channel gender from the document vectors

In [7]:
def build_classification_model(train_arrays, train_labels, test_arrays, test_lebels):
    """Fit a logistic-regression classifier and report its held-out performance.

    Prints the accuracy on the test set and draws a ROC curve through
    ``twitch_commons.plot_roc_curve``.

    Args:
        train_arrays: Feature matrix used to fit the classifier.
        train_labels: Labels matching the rows of ``train_arrays``.
        test_arrays: Feature matrix used for evaluation.
        test_lebels: Labels matching the rows of ``test_arrays``. The
            misspelled parameter name is kept so that existing keyword
            callers keep working.

    Returns:
        None.
    """
    # Bug fix: the body used to read `test_labels`, which is not a parameter,
    # so it silently picked up whatever global of that name existed in the
    # notebook instead of the argument passed by the caller.
    test_labels = test_lebels

    classifier = LogisticRegression()
    classifier.fit(train_arrays, train_labels)
    print('model accuracy : ' + str(classifier.score(test_arrays, test_labels)))

    # Column 1 of predict_proba is the probability of the second class in
    # `classifier.classes_`; for 0/1 labels that is the class labelled 1.
    predict_score = classifier.predict_proba(test_arrays)[:, 1]
    fpr, tpr, _thresholds = metrics.roc_curve(test_labels, predict_score)
    twitch_commons.plot_roc_curve(fpr, tpr)

In [None]:
# First half of the (already shuffled) rows is used for training, the rest
# for testing. The misspelled name is kept because it is a notebook-level
# variable that other cells may read.
trainig_data_size = no_of_channels // 2

# Labels are identical for both experiments below, so they are built once.
# dtype=float matches the float64 arrays the zero-filled buffers used to give.
gender_labels = np.asarray(gender_list, dtype=float)[:no_of_channels]
train_labels = gender_labels[:trainig_data_size]
test_labels = gender_labels[trainig_data_size:]

# --- Experiment 1: document vectors only --------------------------------
# Stacking the vectors takes the feature width from the data instead of a
# hard-coded 100, and replaces the element-by-element copy loops.
doc_features = np.asarray(feature_vectors, dtype=float)[:no_of_channels]
train_arrays = doc_features[:trainig_data_size]
test_arrays = doc_features[trainig_data_size:]

build_classification_model(train_arrays, train_labels, test_arrays, test_labels)

# --- Experiment 2: document vectors plus the quartile bucket -------------
# The quartile value is appended as one extra trailing feature column.
quartile_column = np.asarray(quartile_list, dtype=float)[:no_of_channels]
doc_and_quartile_features = np.column_stack([doc_features, quartile_column])
train_arrays = doc_and_quartile_features[:trainig_data_size]
test_arrays = doc_and_quartile_features[trainig_data_size:]

build_classification_model(train_arrays, train_labels, test_arrays, test_labels)

### Clustering: t-SNE projection of the document vectors

In [None]:
# Only the first `cluster_sample_size` feature vectors are handed to the
# dimensionality reduction; the rest are left out of the plot.
cluster_sample_size = 20000
cluster_input_vectors = feature_vectors[:cluster_sample_size]

# 'tsne' selects the reduction method inside the project helper.
cluster_xy_vectors = twitch_commons.reduce_dim(cluster_input_vectors, 'tsne')



Explained variation per principal component: [ 0.02777437  0.02281112  0.01995092  0.01844645  0.01693605  0.01632945
  0.01486809  0.0146083   0.01403166  0.01377665  0.01332457  0.01301072
  0.0128515   0.01271191  0.01234352  0.01216402  0.01195215  0.01172924
  0.01165623  0.01155279  0.01141864  0.01135101  0.01129528  0.01118586
  0.01111427  0.01098025  0.01091225  0.01086519  0.01079161  0.01069288
  0.01064199  0.01056684  0.01043887  0.01037184  0.01028982  0.01022637
  0.01017484  0.01004326  0.01001328  0.00995065  0.00992581  0.00983311
  0.00978906  0.00971328  0.00966846  0.00958847  0.00956572  0.0095356
  0.00948062  0.00941036]
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...


In [None]:
# Offset that mirrors the rank of rows labelled gender == 1 onto the opposite
# end of the colour scale: they get (offset - rank), all others get rank.
# NOTE(review): 400 presumably exceeds every rank so the two groups land on
# opposite halves of the diverging colormap - confirm against the list sizes.
FEMALE_COLOR_OFFSET = 400

colors = [
    FEMALE_COLOR_OFFSET - rank if gender == 1 else rank
    for gender, rank in zip(gender_list[:cluster_sample_size], rank_list)
]

# `plt.get_cmap` replaces `plt.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and is no longer available in later releases.
cm = plt.get_cmap('seismic')
plt.scatter(
    [point[0] for point in cluster_xy_vectors],
    [point[1] for point in cluster_xy_vectors],
    c=colors,
    s=16,
    lw=0,
    cmap=cm,
)