# 简易情感分析

我们来结合一下sklearn，pandas和刚讲过的工具库，来构建一个简易情感分析模型。

In [1]:
import numpy as np
import pandas as pd

## 加载数据

In [2]:
data = pd.read_csv("./data/emotion_data.csv")

In [3]:
data.shape

(40000, 4)

In [4]:
data.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [5]:
# 不同的情感种类
data.sentiment.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

## 数据预处理

In [6]:
# 去掉无关列
data = data.drop(data.columns[[0,2]], axis=1)

In [7]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [10]:
# dataset = data.as_matrix() 失效
dataset = data.iloc[:,:].values

In [11]:
dataset.shape

(40000, 2)

In [14]:
features = dataset[:,1]
print(features.shape)

(40000,)


In [15]:
features[123]

'@poinktoinkdoink He died.  Wait, what about Magic Jack? I just read it.'

In [16]:
target = dataset[:,0]

In [17]:
# 使用LabelEncoder对不同的情感target进行编码
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target_processed = le.fit_transform(target)

In [18]:
le.classes_

array(['anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'happiness',
       'hate', 'love', 'neutral', 'relief', 'sadness', 'surprise',
       'worry'], dtype=object)

In [19]:
# 对输入的文本进行特征抽取和表示(这里用到的tf-idf特征在后面的课程中会讲到)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_processed = tfidf.fit_transform(features)

In [20]:
X_processed

<40000x48212 sparse matrix of type '<class 'numpy.float64'>'
	with 475946 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed, target_processed, test_size=0.5, random_state=42)

In [22]:
y_train

array([ 3,  5, 10, ...,  4,  6,  7])

In [24]:
X_train.shape

(20000, 48212)

## 模型训练

In [25]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# 模型评估
lr.score(X_test, y_test)

0.3483

In [27]:
# 模型预测
test_ex = ["It is so horrible"]
text_ex_processed = tfidf.transform(test_ex)
lr.predict(text_ex_processed)

array([12])

In [28]:
print(lr.predict(text_ex_processed))

[12]
