In [242]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack, csr_matrix
from textblob import TextBlob
import nltk
import re
import datetime
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVR
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from wordcloud import WordCloud, STOPWORDS 

%matplotlib inline

In [243]:
train_df = pd.read_csv('train_file.csv')
test_df = pd.read_csv('test_file.csv')
test_id = test['IDLink']

In [244]:
train_df.head()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084


In [245]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55932 entries, 0 to 55931
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   IDLink             55932 non-null  object 
 1   Title              55932 non-null  object 
 2   Headline           55932 non-null  object 
 3   Source             55757 non-null  object 
 4   Topic              55932 non-null  object 
 5   PublishDate        55932 non-null  object 
 6   Facebook           55932 non-null  int64  
 7   GooglePlus         55932 non-null  int64  
 8   LinkedIn           55932 non-null  int64  
 9   SentimentTitle     55932 non-null  float64
 10  SentimentHeadline  55932 non-null  float64
dtypes: float64(2), int64(3), object(6)
memory usage: 4.7+ MB


In [246]:
train_df.nunique() 

IDLink               55932
Title                48963
Headline             52112
Source                4753
Topic                    4
PublishDate          49602
Facebook              2166
GooglePlus             273
LinkedIn               648
SentimentTitle       10014
SentimentHeadline    27265
dtype: int64

In [247]:
train_df.isnull().sum()

IDLink                 0
Title                  0
Headline               0
Source               175
Topic                  0
PublishDate            0
Facebook               0
GooglePlus             0
LinkedIn               0
SentimentTitle         0
SentimentHeadline      0
dtype: int64

In [248]:
test_df.isnull().sum()

IDLink           0
Title            0
Headline         0
Source         101
Topic            0
PublishDate      0
Facebook         0
GooglePlus       0
LinkedIn         0
dtype: int64

In [249]:
train_df['Source'].value_counts()[:5]

Bloomberg         992
Reuters           763
ABC News          645
New York Times    573
The Guardian      551
Name: Source, dtype: int64

In [250]:
test_df['Source'].value_counts()[:5]

Bloomberg          740
Reuters            558
ABC News           453
New York Times     419
MSPoweruser.com    416
Name: Source, dtype: int64

In [251]:
train_df['Source'] = train_df['Source'].fillna('Bloomberg')
test_df['Source'] = test_df['Source'].fillna('Bloomberg')

In [252]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop = set(stopwords.words('english'))

def clean(text):
  text_token = word_tokenize(text)
  filtered_text = ' '.join([w.lower() for w in text_token if w.lower() not in stop and len(w) > 2])
  filtered_text = filtered_text.replace(r"[^a-zA-Z]+", '')
  text_only = re.sub(r'\b\d+\b', '', filtered_text)
  clean_text = text_only.replace(',', '').replace('.', '').replace(':', '')
  return clean_text

[nltk_data] Downloading package stopwords to C:\Users\Sanskar
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sanskar
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sanskar
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [253]:
train_df['Text_Title'] = train_df['Title'] + ' ' + train_df['Source'] + ' ' + train_df['Topic']
test_df['Text_Title'] = test_df['Title'] + ' ' + test_df['Source'] + ' ' + test_df['Topic']

train_df['Text_Headline'] = train_df['Headline'] + ' ' + train_df['Source'] + ' ' + train_df['Topic']
test_df['Text_Headline'] = test_df['Headline'] + ' ' + test_df['Source'] + ' ' + test_df['Topic']

In [254]:
train_df['Text_Title'][4]

'Tourism, govt spending buoys Thai economy in January The Nation - Thailand&#39;s English news economy'

In [255]:
train_df['Text_Title'] = [clean(x) for x in train_df['Text_Title']]
test_df['Text_Title'] = [clean(x) for x in test_df['Text_Title']]

train_df['Text_Headline'] = [clean(x) for x in train_df['Text_Headline']]
test_df['Text_Headline'] = [clean(x) for x in test_df['Text_Headline']]

In [256]:
train_df['Text_Title'][4]

'tourism govt spending buoys thai economy january nation thailand english news economy'

In [411]:
vectorizer = TfidfVectorizer(use_idf=True)

train_v_Title = vectorizer.fit_transform(train_df['Text_Title'])
test_v_Title = vectorizer.transform(test_df['Text_Title'])

vectorizer_ = TfidfVectorizer()

train_v_Headline = vectorizer_.fit_transform(train_df['Text_Headline'])
test_v_Headline = vectorizer_.transform(test_df['Text_Headline'])

In [412]:
svd = TruncatedSVD(n_components=20)

train_v_Title = svd.fit_transform(train_v_Title)
test_v_Title = svd.transform(test_v_Title)

train_v_Headline = svd.fit_transform(train_v_Headline)
test_v_Headline = svd.transform(test_v_Headline)

In [310]:
print(train_v_Title)

  (0, 602)	0.3994843401495838
  (0, 576)	0.542936416844237
  (0, 926)	0.5576218975612863
  (0, 887)	0.4844482723673839
  (1, 506)	0.5284223537642353
  (1, 384)	0.5119178713761268
  (1, 149)	0.46609262810950425
  (1, 244)	0.320031564096322
  (1, 94)	0.37289055902776347
  (2, 244)	0.4129045437112839
  (2, 94)	0.48110318919437406
  (2, 353)	0.4856626508532062
  (2, 60)	0.6018150452554468
  (3, 244)	0.20043051470250942
  (3, 342)	0.6436248989412622
  (3, 271)	0.6800464215596224
  (3, 588)	0.28829058736436336
  (4, 244)	0.22601424249109275
  (4, 588)	0.16254455769167617
  (4, 893)	0.41721320344866597
  (4, 363)	0.40989879925258926
  (4, 811)	0.3624326124288049
  (4, 461)	0.40202980581305553
  (4, 575)	0.3572165646873905
  (4, 255)	0.3997798253967682
  :	:
  (55926, 389)	0.42476727554515387
  (55927, 602)	0.3488405199831052
  (55927, 949)	0.4027834187494174
  (55927, 833)	0.4071930559377812
  (55927, 470)	0.3862427984652355
  (55927, 940)	0.4385167027647823
  (55927, 200)	0.4569345975350807


In [311]:
train_df['polarity_t'] = train_df['Title'].apply(lambda x: TextBlob(x).sentiment.polarity)
test_df['polarity_t'] = test_df['Title'].apply(lambda x: TextBlob(x).sentiment.polarity)

train_df['subjectivity_t'] = train_df['Title'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
test_df['subjectivity_t'] = test_df['Title'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [262]:
train_df['polarity_h'] = train_df['Headline'].apply(lambda x: TextBlob(x).sentiment.polarity)
test_df['polarity_h'] = test_df['Headline'].apply(lambda x: TextBlob(x).sentiment.polarity)

train_df['subjectivity_h'] = train_df['Headline'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
test_df['subjectivity_h'] = test_df['Headline'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [263]:
encoder = LabelEncoder()

train_df['Topic'] = encoder.fit_transform(train_df['Topic'])
test_df['Topic'] = encoder.transform(test_df['Topic'])

total = train_df['Source'].to_list() + test_df['Source'].to_list()
total = encoder.fit_transform(total)
train_df['Source'] = encoder.transform(train_df['Source'])
test_df['Source'] = encoder.transform(test_df['Source'])

In [264]:

# Get day-type(monday, tuesday) from datetime

train_weekday = []
test_weekday = []

for i in train_df['PublishDate']:
    train_weekday.append(datetime.datetime.strptime(i, "%Y-%m-%d %H:%M:%S").strftime("%A"))
    
for i in test_df['PublishDate']:
    test_weekday.append(datetime.datetime.strptime(i, "%Y-%m-%d %H:%M:%S").strftime("%A"))

train_df['weekday'] = train_weekday
test_df['weekday'] = test_weekday


# convert weekday to 0-6

train_df['weekday'] = train_df['weekday'].map({'Monday': 0,
                                        'Tuesday': 1,
                                        'Wednesday': 2,
                                        'Thursday': 3,
                                        'Friday': 4,
                                        'Saturday': 5,
                                        'Sunday': 6})
test_df['weekday'] = test_df['weekday'].map({'Monday': 0,
                                        'Tuesday': 1,
                                        'Wednesday': 2,
                                        'Thursday': 3,
                                        'Friday': 4,
                                        'Saturday': 5,
                                        'Sunday': 6})

In [265]:
# Hour from date

train_df["hour"] = train_df["PublishDate"].apply(lambda x: x.split()[1].split(':')[0])
test_df["hour"] = test_df["PublishDate"].apply(lambda x: x.split()[1].split(':')[0])

In [270]:
# Number of words in the Title 
train_df["num_words_t"] = train_df["Text_Title"].apply(lambda x: len(str(x).split()))
test_df["num_words_t"] = test_df["Text_Title"].apply(lambda x: len(str(x).split()))

# Number of unique words in the Title 
train_df["num_unique_words_t"] = train_df["Text_Title"].apply(lambda x: len(set(str(x).split())))
test_df["num_unique_words_t"] = test_df["Text_Title"].apply(lambda x: len(set(str(x).split())))

# Number of characters in the Title 
train_df["num_chars_t"] = train_df["Text_Title"].apply(lambda x: len(str(x)))
test_df["num_chars_t"] = test_df["Text_Title"].apply(lambda x: len(str(x)))

# Average length of the words in the Title 
train_df["mean_word_len_t"] = train_df["Text_Title"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df["mean_word_len_t"] = test_df["Text_Title"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [271]:

# Number of words in the Headline 
train_df["num_words_h"] = train_df["Text_Headline"].apply(lambda x: len(str(x).split()))
test_df["num_words_h"] = test_df["Text_Headline"].apply(lambda x: len(str(x).split()))

# Number of unique words in the Headline 
train_df["num_unique_words_h"] = train_df["Text_Headline"].apply(lambda x: len(set(str(x).split())))
test_df["num_unique_words_h"] = test_df["Text_Headline"].apply(lambda x: len(set(str(x).split())))

# Number of characters in the Headline 
train_df["num_chars_h"] = train_df["Text_Headline"].apply(lambda x: len(str(x)))
test_df["num_chars_h"] = test_df["Text_Headline"].apply(lambda x: len(str(x)))

# Average length of the words in the Headline 
train_df["mean_word_len_h"] = train_df["Text_Headline"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df["mean_word_len_h"] = test_df["Text_Headline"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [272]:
scaler = StandardScaler()

cols = ['Source', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn', 'num_words_t', 'num_unique_words_t', 'num_chars_t', 'mean_word_len_t',
        'num_words_h', 'num_unique_words_h', 'num_chars_h', 'mean_word_len_h', 'hour', 'weekday']

for col in cols:
    train_df[col] = scaler.fit_transform(train_df[col].values.reshape(-1, 1))
    test_df[col] = scaler.transform(test_df[col].values.reshape(-1, 1))

In [291]:
cols_t = ['Source', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn', 'num_words_t', 'num_unique_words_t', 'mean_word_len_t','num_chars_t', 'polarity_t', 'subjectivity_t', 'hour', 'weekday']
train_X1 = train[cols_t]
test_X1 = test[cols_t]

cols_h = ['Source', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn', 'num_words_t', 'num_unique_words_t', 'mean_word_len_t','num_chars_t', 'polarity_t', 'subjectivity_t','num_words_h', 'num_unique_words_h', 'mean_word_len_h','num_chars_h', 'polarity_h', 'subjectivity_h', 'hour', 'weekday']
train_X2 = train[cols_h]
test_X2 = test[cols_h]

In [292]:
train_X1.head()

Unnamed: 0,Source,Topic,Facebook,GooglePlus,LinkedIn,num_words_t,num_unique_words_t,mean_word_len_t,num_chars_t,polarity_t,subjectivity_t,hour,weekday
0,1.2577,0.841443,-0.184044,-0.262649,-0.199608,-0.259144,-0.304659,-0.612899,-0.572668,0.0,0.0,-1.699073,-0.795924
1,-1.314091,-1.108773,-0.184044,-0.262649,-0.199608,-1.668699,-1.721681,0.274497,-1.629548,0.0,0.0,-1.699073,1.338369
2,-1.314091,-1.108773,-0.184044,-0.262649,-0.199608,-1.198848,-1.24934,0.328828,-1.035053,0.0,0.0,-1.699073,1.338369
3,0.468412,-1.108773,-0.184044,-0.262649,-0.199608,-1.668699,-1.24934,-1.436911,-2.224043,0.0,0.0,-1.699073,1.871942
4,1.029071,-1.108773,-0.184044,-0.262649,-0.199608,1.150411,1.112364,-0.295972,1.012652,0.0,0.0,-1.699073,1.871942


In [293]:
print(csr_matrix(train_X1.values))

  (0, 0)	1.2576997634688756
  (0, 1)	0.841443095428152
  (0, 2)	-0.1840444807723746
  (0, 3)	-0.2626493550487806
  (0, 4)	-0.1996084239198007
  (0, 5)	-0.259144234496245
  (0, 6)	-0.3046586928529652
  (0, 7)	-0.6128993994536583
  (0, 8)	-0.5726681178922972
  (0, 11)	-1.6990733189086988
  (0, 12)	-0.7959239487209263
  (1, 0)	-1.3140910230329217
  (1, 1)	-1.1087732192299813
  (1, 2)	-0.1840444807723746
  (1, 3)	-0.2626493550487806
  (1, 4)	-0.1996084239198007
  (1, 5)	-1.668699356916161
  (1, 6)	-1.7216813679265908
  (1, 7)	0.274497439940456
  (1, 8)	-1.6295483600819824
  (1, 11)	-1.6990733189086988
  (1, 12)	1.3383691151678876
  (2, 0)	-1.3140910230329217
  (2, 1)	-1.1087732192299813
  (2, 2)	-0.1840444807723746
  :	:
  (55929, 11)	-1.5553969654148538
  (55929, 12)	-0.7959239487209263
  (55930, 0)	0.7923992191163056
  (55930, 1)	-0.13366506190091465
  (55930, 2)	-0.18266121113151082
  (55930, 3)	-0.2153389244819818
  (55930, 4)	-0.1865622345169816
  (55930, 5)	-1.1988476494428557
  (559

In [294]:
train_X2.head()

Unnamed: 0,Source,Topic,Facebook,GooglePlus,LinkedIn,num_words_t,num_unique_words_t,mean_word_len_t,num_chars_t,polarity_t,subjectivity_t,num_words_h,num_unique_words_h,mean_word_len_h,num_chars_h,polarity_h,subjectivity_h,hour,weekday
0,1.2577,0.841443,-0.184044,-0.262649,-0.199608,-0.259144,-0.304659,-0.612899,-0.572668,0.0,0.0,-0.285943,-0.526753,-1.058531,-0.490533,0.0,0.0,-1.699073,-0.795924
1,-1.314091,-1.108773,-0.184044,-0.262649,-0.199608,-1.668699,-1.721681,0.274497,-1.629548,0.0,0.0,-0.285943,-0.232503,-0.175891,-0.317671,0.1,0.2,-1.699073,1.338369
2,-1.314091,-1.108773,-0.184044,-0.262649,-0.199608,-1.198848,-1.24934,0.328828,-1.035053,0.0,0.0,-0.413305,-0.673878,0.618485,-0.300385,0.0,0.041667,-1.699073,1.338369
3,0.468412,-1.108773,-0.184044,-0.262649,-0.199608,-1.668699,-1.24934,-1.436911,-2.224043,0.0,0.0,0.096143,0.061747,0.843558,0.304632,-0.166667,0.166667,-1.699073,1.871942
4,1.029071,-1.108773,-0.184044,-0.262649,-0.199608,1.150411,1.112364,-0.295972,1.012652,0.0,0.0,0.223505,0.208872,0.618485,0.391063,0.133333,0.380556,-1.699073,1.871942


In [312]:
print(np.shape(train_X1))
print(np.shape(test_X1))

(55932, 13)
(37288, 13)


In [313]:

print(np.shape(train_X2))
print(np.shape(test_X2))

(55932, 19)
(37288, 19)


In [366]:
print(np.shape(train_v_Title))
print(np.shape(test_v_Title))

(55932, 30)
(37288, 30)


In [367]:
print(np.shape(train_v_Headline))
print(np.shape(test_v_Headline))

(55932, 30)
(37288, 30)


In [413]:
train_X_Title = hstack([train_v_Title, csr_matrix(train_X1.values)])
test_X_Title = hstack([test_v_Title, csr_matrix(test_X1.values)])
y1 = train['SentimentTitle']

train_X_Headline = hstack([train_v_Headline, csr_matrix(train_X2.values)])
test_X_Headline = hstack([test_v_Headline, csr_matrix(test_X2.values)])
y2 = train['SentimentHeadline']

In [414]:
np.shape(train_X_Title)

(55932, 33)

In [415]:
# LinearSVR model for SentimentTitle

X_train, X_test, y_train, y_test = train_test_split(train_X_Title, y1, test_size=0.20, random_state=2)

clf1 = LinearSVR(C=0.2,max_iter=10000)
clf1.fit(X_train, y_train)

y_pred1 = clf1.predict(X_test)
mae1 = mean_absolute_error(y_pred1, y_test)
print('MAE:', 1 - mae1)

MAE: 0.9067386758064184


In [416]:
# LinearSVR model for SentimentHeadline

X_train, X_test, y_train, y_test = train_test_split(train_X_Headline, y2, test_size=0.20, random_state=2)

clf2 = LinearSVR(C=0.1,max_iter=10000)
clf2.fit(X_train, y_train)

y_pred2 = clf2.predict(X_test)
mae2 = mean_absolute_error(y_pred2, y_test)
print('MAE:', 1 - mae2)

MAE: 0.8952747622646386


In [417]:
#using LinearSVR for prediction
print('MAE:', 1 - ((0.4 * mae1) + (0.6 * mae2)))

MAE: 0.8998603276813505


In [81]:
from xgboost import XGBRegressor

In [87]:
#using XGB

X_train, X_test, y_train, y_test = train_test_split(train_X_Title, y1, test_size=0.20, random_state=42)
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 
          'n_estimators': 250, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
clf1 = XGBRegressor(random_state=2, **params)
clf1.fit(X_train, y_train)

y_pred1 = clf1.predict(X_test)
mae1 = mean_absolute_error(y_pred1, y_test)
print('MAE:', 1 - mae1)

MAE: 0.9168106274808605


In [88]:
#using XGB
X_train, X_test, y_train, y_test = train_test_split(train_X_Headline, y2, test_size=0.20, random_state=42)

params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 
          'n_estimators': 250, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
clf2 = XGBRegressor(random_state=2, **params)
clf2.fit(X_train, y_train)

y_pred2 = clf2.predict(X_test)
mae2 = mean_absolute_error(y_pred2, y_test)
print('MAE:', 1 - mae2)

MAE: 0.9041842435752636


In [108]:
#using lgb

X_train, X_test, y_train, y_test = train_test_split(train_X_Title, y1, test_size=0.20, random_state=42)

params = {}
params['learning_rate'] = 0.03
#params['boosting_type'] = 'gbdt'
params['metric'] = 'l2'
params['sub_feature'] = 0.55
params['num_leaves'] = 60
params['min_data'] = 30

d_train = lgb.Dataset(X_train, label = y_train)
clf1 = lgb.train(params,d_train, 100)


y_pred1 = clf1.predict(X_test)
mae1 = mean_absolute_error(y_pred1, y_test)
print('MAE:', 1 - mae1)

MAE: 0.9228767995170419


In [95]:
#using lgb
X_train, X_test, y_train, y_test = train_test_split(train_X_Headline, y2, test_size=0.20, random_state=42)

params = {}
params['learning_rate'] = 0.029
#params['boosting_type'] = 'gbdt'
params['metric'] = 'l2'
params['sub_feature'] = 0.55
params['num_leaves'] = 40
params['min_data'] = 50

d_train = lgb.Dataset(X_train, label = y_train)

clf2 = lgb.train(params,d_train, 100)


y_pred2 = clf2.predict(X_test)
mae2 = mean_absolute_error(y_pred2, y_test)
print('MAE:', 1 - mae2)

MAE: 0.9067788485348552


random forest

In [349]:
#random forest

X_train, X_test, y_train, y_test = train_test_split(train_X_Title, y1, test_size=0.20, random_state=2)

clf1 = RandomForestRegressor(n_estimators =10)
clf1.fit(X_train, y_train)
    
y_pred1 = clf1.predict(X_test)
mae1 = mean_absolute_error(y_pred1, y_test)
print('MAE:', 1 - mae1)

MAE: 0.9219321112023146


In [350]:
#random forest

X_train, X_test, y_train, y_test = train_test_split(train_X_Headline, y2, test_size=0.20, random_state=2)

clf2 = RandomForestRegressor(n_estimators =10)
clf2.fit(X_train, y_train)

y_pred2 = clf2.predict(X_test)
mae2 = mean_absolute_error(y_pred2, y_test)
print('MAE:', 1 - mae2)

MAE: 0.9072569868853468


In [418]:
#using LinearSVR for prediction as it gave the best results


title = clf1.predict(test_X_Title)
headline = clf2.predict(test_X_Headline)

In [419]:
df = pd.DataFrame()
df['IDLink'] = test_id
df['SentimentTitle'] = title
df['SentimentHeadline'] = headline
df.to_csv('submit_1_2.csv', index=False)