In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [3]:
# Two-class subset of 20 Newsgroups. The `remove` option asks the loader to
# strip headers, footers and quoted replies from every post.
categories = ['alt.atheism', 'soc.religion.christian']
remove = ('headers', 'footers', 'quotes')
# Fetch the train split first and the test split second, with identical options.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, remove=remove, categories=categories)
    for split in ('train', 'test')
)

In [4]:
# Fit the TF-IDF vectorizer on the training documents only, then reuse the
# same fitted vectorizer to map the test documents into that feature space.
tf_vec = TfidfVectorizer()
X_train_tfidf = tf_vec.fit_transform(twenty_train.data)
X_test_tfidf = tf_vec.transform(twenty_test.data)

In [5]:
# Train a linear SVM on the TF-IDF training matrix (fit() returns the estimator).
model = LinearSVC().fit(X_train_tfidf, twenty_train.target)
# Label the held-out test documents with the fitted model.
predicted = model.predict(X_test_tfidf)
# Accuracy: fraction of test documents whose predicted label equals the true label.
(predicted == twenty_test.target).mean()

0.8089260808926081

In [6]:
# Inspect the raw value array of the training TF-IDF matrix
# (presumably a SciPy sparse matrix, whose .data holds only the stored entries).
X_train_tfidf.data

array([0.02829614, 0.02343459, 0.01819597, ..., 0.08984162, 0.11221961,
       0.11370739])

In [7]:
# Toy corpus of space-separated Japanese tokens; the third document is an
# empty string.
corpus = [
    '無料 ごはん おかず ごはん おかず ディナー クーポン クーポン 食事', 
    '無料 ごはん おかず ごはん おかず ランチ  クーポン クーポン 食事 昼飯',
    '']
# Compute the bag-of-words counts.
count_vectorizer = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
bow_vec = count_vectorizer.fit_transform(corpus)
# The printed form reads like a Dictionary-of-Keys style sparse listing:
# each stored entry is shown as "(row, column)  value".
# E.g. "(0, 1)  2" means the value at row 0, column 1 is 2.
print(bow_vec)

  (0, 6)	1
  (0, 1)	2
  (0, 0)	2
  (0, 3)	1
  (0, 2)	2
  (0, 7)	1
  (1, 6)	1
  (1, 1)	2
  (1, 0)	2
  (1, 2)	2
  (1, 7)	1
  (1, 4)	1
  (1, 5)	1


In [8]:
# Dense view of the matrix.
# Rows: one per document.
# Columns: one per counted word.
bow_vec.toarray()

array([[2, 2, 2, 1, 0, 0, 1, 1],
       [2, 2, 2, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0]])

In [9]:
# To see which word each column counts, list the vocabulary in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# list() keeps the displayed result a plain list of strings either way.
list(
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, 'get_feature_names_out')
    else count_vectorizer.get_feature_names()
)

['おかず', 'ごはん', 'クーポン', 'ディナー', 'ランチ', '昼飯', '無料', '食事']

In [10]:
# Next, do the same kind of thing with TF-IDF.

In [11]:
# About token_pattern, see e.g. the post linked below. In short, single-character
# tokens must not be dropped: unlike in English, they carry meaning.
# https://otknoy.hatenablog.com/entry/2015/10/11/200650
# NOTE(review): bow_vec is rebound here to the TF-IDF matrix; it previously held
# the CountVectorizer counts, so the name no longer describes its contents.
tfidf_vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
bow_vec = tfidf_vectorizer.fit_transform(corpus)
print(bow_vec)

  (0, 7)	0.252145197440413
  (0, 2)	0.504290394880826
  (0, 3)	0.3315406335704443
  (0, 0)	0.504290394880826
  (0, 1)	0.504290394880826
  (0, 6)	0.252145197440413
  (1, 5)	0.3146958984507801
  (1, 4)	0.3146958984507801
  (1, 7)	0.23933434220122643
  (1, 2)	0.47866868440245286
  (1, 0)	0.47866868440245286
  (1, 1)	0.47866868440245286
  (1, 6)	0.23933434220122643


In [12]:
# Dense view of the TF-IDF weights (bow_vec was reassigned to the
# TfidfVectorizer output in the preceding cell); rows are documents.
bow_vec.toarray()

array([[0.50429039, 0.50429039, 0.50429039, 0.33154063, 0.        ,
        0.        , 0.2521452 , 0.2521452 ],
       [0.47866868, 0.47866868, 0.47866868, 0.        , 0.3146959 ,
        0.3146959 , 0.23933434, 0.23933434],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

In [13]:
# List the vocabulary in column order for the TF-IDF vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# list() keeps the displayed result a plain list of strings either way.
list(
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)

['おかず', 'ごはん', 'クーポン', 'ディナー', 'ランチ', '昼飯', '無料', '食事']

In [14]:
# Show the vectorizer's full configuration as a dict, including defaults that
# were not set explicitly (only token_pattern was passed to the constructor).
tfidf_vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [16]:
# Small demo frame used to compare the output shapes of DataFrame.to_dict().
# pandas is imported together with the other dependencies in the notebook's
# first cell rather than here, so the dependency list lives in one place.
df = pd.DataFrame({'A': [0, 1, 2], 'B': [2, 3, 4]})
df

Unnamed: 0,A,B
0,0,2
1,1,3
2,2,4


In [17]:
# orient='dict': {column -> {index label -> value}}.
d = df.to_dict(orient='dict')
d

{'A': {0: 0, 1: 1, 2: 2}, 'B': {0: 2, 1: 3, 2: 4}}

In [19]:
# orient='list': {column -> [values in row order]}; index labels are dropped.
l = df.to_dict(orient='list')
l

{'A': [0, 1, 2], 'B': [2, 3, 4]}

In [20]:
# orient='series': {column -> pandas Series}, keeping index and dtype.
s = df.to_dict(orient='series')
s

{'A': 0    0
 1    1
 2    2
 Name: A, dtype: int64,
 'B': 0    2
 1    3
 2    4
 Name: B, dtype: int64}

In [21]:
# Load the intermediate users table; the displayed frame has `uid` and `uname`
# columns. The path is relative, so it resolves against the kernel's working
# directory.
user_tbl = pd.read_csv('../data/020_intermediate/users.csv')
user_tbl

Unnamed: 0,uid,uname
0,USLACKBOT,Slackbot
1,UJKFAPBCJ,岡村龍弥
2,UJRAL005U,村上 智之
3,UJRDMQSAD,勝又健太 kenta.katsumata
4,UKEMYD9R8,tetsuroito
...,...,...
159,UV1GRG3UZ,Kazuhii
160,UV1J8NN6B,funakoshi
161,UV37036U8,Yuna
162,UV92NJEJH,奥津 勝太


In [22]:
# Select the rows whose uid column equals the wanted id; the result is still a
# DataFrame, even when only one row matches.
uid = 'USLACKBOT'
target = user_tbl.loc[user_tbl['uid'] == uid]
target

Unnamed: 0,uid,uname
0,USLACKBOT,Slackbot


In [23]:
# Confirm that the filtered result is a DataFrame rather than a single row or scalar.
type(target)

pandas.core.frame.DataFrame

In [26]:
# Pull the user name out of the first matching row as a plain scalar.
target['uname'].iloc[0]

'Slackbot'