In [1]:
import pandas as pd
raw_data = pd.read_csv('train_2022.csv')
raw_data

Unnamed: 0,row_id,TEXT,LABEL
0,0,director dirk shafer and co-writer greg hinton...,0
1,1,"a charming , quirky and leisurely paced scotti...",1
2,2,"the price was good , and came quickly though ...",1
3,3,i was looking forward to this game for a coupl...,0
4,4,arguably the year 's silliest and most incoher...,0
...,...,...,...
1995,1995,an imaginative comedy\/thriller .,1
1996,1996,a savvy exploration of paranoia and insecurity...,1
1997,1997,on the other hand for power grating you ve got...,1
1998,1998,"like dickens with his passages , mcgrath craft...",1


In [2]:
test_data = pd.read_csv('test_no_answer_2022.csv')
test_data

Unnamed: 0,row_id,TEXT
0,0,good to know if you can t find these elsewhere .
1,1,love it ! the grill plates come out and pop i...
2,2,i m convinced this was a poorly executed refur...
3,3,i would never have complained about that if it...
4,4,"the photo shows the same whole , large candie..."
...,...,...
10995,10995,i didn t quite get it the first time .
10996,10996,i ve tried installing with and without the oem...
10997,10997,i was parked at a truck stop in the cincinnati...
10998,10998,i recently bought this case after seeing some ...


# 加入情緒字典分析

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
def append_sentiment_dictionary(df):
    sid = SentimentIntensityAnalyzer()
    df['sentiment'] = df['TEXT'].apply(lambda x: sid.polarity_scores(x))
    df = pd.concat([df.drop(['sentiment'], axis=1), df['sentiment'].apply(pd.Series)], axis=1)
    return df

# 加入TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def append_tfidf(df):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['TEXT'])
    X_tfidf_df = pd.DataFrame(list(tfidf_matrix.toarray()), columns=['tfidf_'+str(i) for i in range(tfidf_matrix.shape[1])])
    df = pd.concat([df, X_tfidf_df], axis=1)
    return df

### 情緒字典 + SVM

In [6]:
%%time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

train_data = raw_data.copy()
train_data = append_sentiment_dictionary(train_data)
# train_data = append_tfidf(train_data)

X = train_data.drop(columns=['TEXT','LABEL'])

y = train_data['LABEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVC(kernel='linear')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.625
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.53      0.57       189
           1       0.63      0.71      0.67       211

    accuracy                           0.62       400
   macro avg       0.62      0.62      0.62       400
weighted avg       0.62      0.62      0.62       400

CPU times: total: 19.2 s
Wall time: 43.7 s


### TF-IDF + 情緒字典 + SVM

In [7]:
%%time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

train_data = raw_data.copy()
train_data = append_sentiment_dictionary(train_data)
train_data = append_tfidf(train_data)

X = train_data.drop(columns=['TEXT','LABEL'])

y = train_data['LABEL']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVC(kernel='linear')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.635
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.46      0.54       189
           1       0.62      0.80      0.70       211

    accuracy                           0.64       400
   macro avg       0.64      0.63      0.62       400
weighted avg       0.64      0.64      0.62       400

CPU times: total: 3min 29s
Wall time: 9min 18s


# 測試集
### 全1: 應該差不多
### 全0: 0.50055

In [9]:
predict_data = test_data.copy()
predict_data['LABEL'] = 0
predict_data = predict_data.drop(columns=['TEXT'])
predict_data

Unnamed: 0,row_id,LABEL
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
10995,10995,0
10996,10996,0
10997,10997,0
10998,10998,0


In [5]:
import datetime
import pytz
def export_csv(df,name):
  now = datetime.datetime.now().astimezone(pytz.timezone('Asia/Taipei'))
  formatted_time = now.strftime('%Y-%m-%d %H時%M分%S秒')
  df.to_csv('predict_data/'+ name + '_' + formatted_time + ".csv", index=False,encoding="utf_8_sig")

In [10]:
export_csv(predict_data,'all_zero')