In [0]:
# import libraries
from google.colab import drive
drive.mount('/content/gdrive')
import nltk, re, time
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.ensemble import AdaBoostClassifier
import re

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using TensorFlow backend.


In [0]:
data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/two_sigma_dataset/news_sample.csv')

In [0]:
# load data
data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/two_sigma_dataset/news_sample.csv')

# select the relevant columns
df = data[['headline','sentimentClass']]

# build a function that cleanse the data
def clean_text(headline):
  """cleaning data"""
  headline = headline.lower().split()
  stopword = set(stopwords.words("english"))
  headline = [word for word in headline if not word in stopword]
  headline = " ".join(headline)
  headline = re.sub('[^a-zA-z0-9\s]','',headline)
  return headline

# apply the cleaning data function to the 'headline' columns of df
df['headline'] = df['headline'].apply(clean_text)

# build a function that tokenize the headlines
def tokenize(feature):
  max_features = 50
  tokenizer = Tokenizer(num_words=max_features, split=' ')
  tokenizer.fit_on_texts(feature.values)
  X = tokenizer.texts_to_sequences(feature.values)
  X = pad_sequences(X)
  return X

# build a function that categorise the target variable
def categorize(target):
  if target == 0:
    target = 'Neutral'
  elif target == 1:
    target = 'Positive'
  else:
    target = 'Negative'   
  return target

X = df['headline']
y = df['sentimentClass']

# tokenize the headlines which will be vectorised so that it can be fed in the neural network
X = tokenize(X)

# categorise the target and then do one-hot-encoding so that it can be fed in the neural network
y = y.apply(categorize)
y = pd.factorize(y)[0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [0]:
# import libraries for machine learning
import sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import numpy as np

In [0]:
# build pipelines with hyperparameters

pipelines = {
    'rf' : make_pipeline(RandomForestClassifier(random_state=123)),
    'gb' : make_pipeline(GradientBoostingClassifier(random_state=123)),
    'xgb': make_pipeline(XGBClassifier(random_state=123)),
    'ada': make_pipeline(AdaBoostClassifier(random_state=123))
}

rf_hyperparameters = {'randomforestclassifier__n_estimators':[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]}
gb_hyperparameters = {'gradientboostingclassifier__n_estimators':[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]}
xgb_hyperparameters = {'xgbclassifier__n_estimators':[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] }
ada_hyperparameters = {'adaboostclassifier__n_estimators':[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]}

hyperparameters = {
    'rf' : rf_hyperparameters,
    'gb' : gb_hyperparameters,
    'xgb': xgb_hyperparameters,
    'ada':ada_hyperparameters
}

In [0]:
# adopt gridsearchCV to tune hyperparameters
fitted_models = {}

for name, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hyperparameters[name], cv=5, n_jobs=-1)
    model.fit(X_train, y_train)
    fitted_models[name] = model
    print(name, 'has been fitted.')



rf has been fitted.




gb has been fitted.




xgb has been fitted.




ada has been fitted.


In [0]:
print('The fitted best score for each model: \n')
for name, model in fitted_models.items():
  print(name, model.best_score_)
  
print('\n', '-----'*10, '\n')

print('The accuracy  score for each model: \n')
for name, model in fitted_models.items():
  print(name, accuracy_score(y_test, model.predict(X_test)))

The fitted best score for each model: 

rf 0.5571428571428572
gb 0.5428571428571428
xgb 0.5
ada 0.45714285714285713

 -------------------------------------------------- 

The accuracy  score for each model: 

rf 0.4
gb 0.4
xgb 0.4
ada 0.5
