In [3]:
import pandas as pd
# Load the labelled training set; the preview below shows row_id, TEXT and LABEL columns.
raw_data = pd.read_csv('train_2022.csv')
# Last expression of the cell: renders a truncated preview of the frame.
raw_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
1995,1995,an imaginative comedy\/thriller .,1
1996,1996,a savvy exploration of paranoia and insecurity...,1
1997,1997,on the other hand for power grating you ve got...,1
1998,1998,"like dickens with his passages , mcgrath craft...",1


In [4]:
# Load the unlabelled test set; the preview below shows row_id and TEXT but no LABEL column.
test_data = pd.read_csv('test_no_answer_2022.csv')
# Last expression of the cell: renders a truncated preview of the frame.
test_data

Unnamed: 0,row_id,TEXT
0,0,good to know if you can t find these elsewhere .
1,1,love it ! the grill plates come out and pop i...
2,2,i m convinced this was a poorly executed refur...
3,3,i would never have complained about that if it...
4,4,"the photo shows the same whole , large candie..."
...,...,...
10995,10995,i didn t quite get it the first time .
10996,10996,i ve tried installing with and without the oem...
10997,10997,i was parked at a truck stop in the cincinnati...
10998,10998,i recently bought this case after seeing some ...


# Data Analysis

In [5]:
# Tally how many training rows carry each label value.
label_counts = raw_data['LABEL'].value_counts()
# .get(label, 0) reports 0 for a label that never occurs instead of raising KeyError.
count_0, count_1 = (label_counts.get(label, 0) for label in (0, 1))

print("label 0 counts:", count_0)
print("label 1 counts:", count_1)

label 0 counts: 1000
label 1 counts: 1000


In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns of the training set.
raw_data.describe()

Unnamed: 0,row_id,LABEL
count,2000.0,2000.0
mean,999.5,0.5
std,577.494589,0.500125
min,0.0,0.0
25%,499.75,0.0
50%,999.5,0.5
75%,1499.25,1.0
max,1999.0,1.0


In [16]:
# One-time setup (already installed in this environment); uncomment on a fresh one:
# pip install nltk
# import nltk
# nltk.download('vader_lexicon')

Note: you may need to restart the kernel to use updated packages.


## 加入情緒字典分析

In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Make sure the VADER lexicon is present so this cell also runs on a fresh
# environment; the download is attempted only when the resource is missing.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Initialise the sentiment analyser.
sid = SentimentIntensityAnalyzer()

# Score every text. polarity_scores returns one dict per text, so building a
# DataFrame from the list of dicts expands the scores into columns in a single
# step (much cheaper than Series.apply(pd.Series), which builds a Series per row).
sentiment_scores = pd.DataFrame(
    [sid.polarity_scores(text) for text in raw_data['TEXT']],
    index=raw_data.index,
)

# Derive train_data from raw_data (never mutated) so re-running this cell is
# idempotent; the score columns are appended after the original columns.
train_data = pd.concat([raw_data, sentiment_scores], axis=1)

# Show the result.
train_data

Unnamed: 0,row_id,TEXT,LABEL,neg,neu,pos,compound
0,0,director dirk shafer and co-writer greg hinton...,0,0.190,0.750,0.060,-0.3818
1,1,"a charming , quirky and leisurely paced scotti...",1,0.168,0.647,0.185,0.1531
2,2,"the price was good , and came quickly though ...",1,0.000,0.775,0.225,0.4404
3,3,i was looking forward to this game for a coupl...,0,0.000,1.000,0.000,0.0000
4,4,arguably the year 's silliest and most incoher...,0,0.185,0.648,0.167,-0.0516
...,...,...,...,...,...,...,...
1995,1995,an imaginative comedy\/thriller .,1,0.000,1.000,0.000,0.0000
1996,1996,a savvy exploration of paranoia and insecurity...,1,0.447,0.447,0.106,-0.7269
1997,1997,on the other hand for power grating you ve got...,1,0.096,0.685,0.219,0.4215
1998,1998,"like dickens with his passages , mcgrath craft...",1,0.000,0.769,0.231,0.5859


## 加入TF-IDF

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the TEXT column and encode every row in one pass.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['TEXT'])

# Store one dense TF-IDF vector per row in a new 'TF-IDF' column. assign()
# returns a new frame, so the previous train_data object is left untouched.
train_data = train_data.assign(**{'TF-IDF': list(tfidf_matrix.toarray())})

train_data

Unnamed: 0,row_id,TEXT,LABEL,neg,neu,pos,compound,TF-IDF
0,0,director dirk shafer and co-writer greg hinton...,0,0.190,0.750,0.060,-0.3818,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,"a charming , quirky and leisurely paced scotti...",1,0.168,0.647,0.185,0.1531,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,"the price was good , and came quickly though ...",1,0.000,0.775,0.225,0.4404,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,i was looking forward to this game for a coupl...,0,0.000,1.000,0.000,0.0000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,arguably the year 's silliest and most incoher...,0,0.185,0.648,0.167,-0.0516,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...
1995,1995,an imaginative comedy\/thriller .,1,0.000,1.000,0.000,0.0000,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1996,1996,a savvy exploration of paranoia and insecurity...,1,0.447,0.447,0.106,-0.7269,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1997,1997,on the other hand for power grating you ve got...,1,0.096,0.685,0.219,0.4215,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1998,1998,"like dickens with his passages , mcgrath craft...",1,0.000,0.769,0.231,0.5859,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### TF-IDF + 情緒字典 + SVM

In [10]:
%%time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

X_tfidf = train_data['TF-IDF']
X_numeric = train_data[['neg', 'neu', 'pos', 'compound']]
y = train_data['LABEL']

# 将X_tfidf转换为DataFrame对象
X_tfidf_df = pd.DataFrame(X_tfidf.tolist(), columns=['tfidf_'+str(i) for i in range(tfidf_matrix.shape[1])])

# 合并特征
X = pd.concat([X_numeric, X_tfidf_df], axis=1)
# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 建立并训练模型
model = SVC(kernel='linear')  # 使用线性核心的支持向量机
model.fit(X_train, y_train)

# 预测
y_pred = model.predict(X_test)

# 评估模型
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.6725
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.73      0.68       189
           1       0.72      0.62      0.67       211

    accuracy                           0.67       400
   macro avg       0.68      0.68      0.67       400
weighted avg       0.68      0.67      0.67       400

CPU times: total: 29.9 s
Wall time: 1min 52s


### 情緒字典 + SVM

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Work on a copy of the feature frame for this experiment.
data = train_data.copy()

# Only the four sentiment scores serve as features; LABEL is the target.
sentiment_columns = ['neg', 'neu', 'pos', 'compound']
X, y = data[sentiment_columns], data['LABEL']

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear-kernel support vector machine; fit() returns the fitted estimator.
model = SVC(kernel='linear').fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.6275
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.71      0.64       189
           1       0.68      0.55      0.61       211

    accuracy                           0.63       400
   macro avg       0.63      0.63      0.63       400
weighted avg       0.64      0.63      0.63       400



# 測試集
### 全1: 應該差不多
### 全0: 0.50055

In [9]:
# Constant baseline submission: drop the text and label every test row as 0.
# drop() and assign() each return a new frame, so test_data is not modified.
predict_data = test_data.drop(columns=['TEXT']).assign(LABEL=0)
predict_data

Unnamed: 0,row_id,LABEL
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
10995,10995,0
10996,10996,0
10997,10997,0
10998,10998,0


In [5]:
import datetime
import pytz
def export_csv(df,name):
    """Write ``df`` to a timestamped CSV file under ``predict_data/``.

    The file is named ``<name>_<timestamp>.csv``, where the timestamp is the
    current time in the Asia/Taipei timezone. The frame's index is not
    written, and the file is encoded as UTF-8 with a byte-order mark.

    Args:
        df: Object exposing a pandas-style ``to_csv`` method.
        name: Prefix for the output file name.

    Raises:
        OSError: If the output directory or file cannot be created.
    """
    import os  # local import so the function does not rely on another cell

    # Ask for the current time directly in the target timezone.
    now = datetime.datetime.now(pytz.timezone('Asia/Taipei'))
    formatted_time = now.strftime('%Y-%m-%d %H時%M分%S秒')
    path = 'predict_data/' + name + '_' + formatted_time + ".csv"

    # to_csv does not create missing directories; without this the first
    # export in a fresh checkout fails.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_csv(path, index=False, encoding="utf_8_sig")

In [10]:
# Write the all-zero baseline predictions to a timestamped CSV whose name starts with 'all_zero'.
export_csv(predict_data,'all_zero')