In [3]:
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from googleapiclient import errors
from email.message import EmailMessage
import base64
import google.auth

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Load mails from gmail

In [4]:
# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

In [5]:
def get_credential():
    """
    Returns credential
    """
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    try:
        return creds

    except HttpError as error:
        # TODO(developer) - Handle errors from gmail API.
        print(f'An error occurred: {error}')

In [6]:
creds = get_credential()

In [1]:
def get_threads():
    try:
        # create gmail api client
        service = build('gmail', 'v1', credentials=creds)
        service.close
        # pylint: disable=maybe-no-member
        # pylint: disable:R1710
        threads = service.users().threads().list(userId='me', includeSpamTrash=True, maxResults=500).execute().get('threads', [])    
        return threads

    except HttpError as error:
        print(F'An error occurred: {error}')

In [75]:
threads = get_threads()
#threads

In [74]:
category_placeholders = ['spam' for _ in range(len(threads))]
#category_placeholders

In [54]:
id_list = []
snippet_list = []

for thread in threads:
    id_list.append(thread['id'])
    snippet_list.append(thread['snippet'])


In [55]:
data = {
    'Category': category_placeholders,
    'Message': snippet_list,
    'id': id_list
}

In [56]:
df = pd.DataFrame(data)
df.to_csv('csv_to_work_on.csv', index=False)

수작업으로 훈련 데이터 라벨링할 것

--------


# Pre-Processing Data

In [4]:
# loading the data from csv file to a pandas Dataframe
raw_mail_data = pd.read_csv('./labeled_data.csv')

In [5]:
# replace the null values with a null string
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

### Checking Data

In [6]:
mail_data.head()

Unnamed: 0,Category,Message,id
0,spam,김민규 Stories for 김민규 @gu79917991·Become a membe...,18a2998e7b889798
1,spam,r/MediumApp: Kindly give your valuable feedbac...,18a27824173ae6c6
2,spam,마지막으로 LinkedIn을 방문하신 후 있었던 일들 ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ...,18a273469855133a
3,spam,김민규 Stories for 김민규 @gu79917991·Become a membe...,18a24728eeacdd7e
4,spam,r/MediumApp: r/mediumapp is open for posting n...,18a22648eb7ee9c6


In [7]:
# checking the number of rows and columns in the dataframe
mail_data.shape

(500, 3)

### Labeling Data
spam = 0
ham = 1

In [8]:
# label spam mail as 0;  ham mail as 1;

mail_data.loc[mail_data['Category'] == 'spam', 'Category',] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category',] = 1

### Get X and Y

In [9]:
# separating the data as texts and label

X = mail_data['Message']

Y = mail_data['Category']

In [82]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

### Feature Extraction

In [83]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

# feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)
feature_extraction = TfidfVectorizer(min_df = 1, lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
# X_train_features = feature_extraction.fit_transform()
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [84]:
# print(X_train_features)

# Training Data

In [183]:
model = DecisionTreeClassifier(max_depth=8, random_state=1)

In [184]:
model.fit(X_train_features, Y_train)

# Evaluate Result

In [185]:
# prediction on training data

prediction_on_training_data = model.predict(X_test_features)
accuracy_on_training_data = accuracy_score(Y_test, prediction_on_training_data)

In [186]:
print(prediction_on_training_data)

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0
 0 0 1 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 0 0]


In [187]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.8533333333333334


In [188]:
from sklearn.metrics import confusion_matrix

In [189]:
cm = confusion_matrix(Y_test, prediction_on_training_data)
cm

array([[118,  12],
       [ 10,  10]])