In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation / convergence
# warnings raised by the modelling libraries - consider narrowing it.
import warnings
warnings.filterwarnings(action="ignore")
# Raise the truncation limits for printed sequences and displayed rows to 8000.
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000

In [None]:
# Directory holding the competition files; defined once so the three reads
# below always point at the same dataset.
DATA_DIR = '/kaggle/input/zs-challenge-find-sentiment-of-news'

train = pd.read_csv(f'{DATA_DIR}/train_file.csv')
test = pd.read_csv(f'{DATA_DIR}/test_file.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
import pandas_profiling as pp

# Build an automated profiling report of the training frame; leaving `prof`
# as the last expression makes the notebook render it as the cell output.
prof = pp.ProfileReport(train, title="Pandas Profiling Report")
prof

In [None]:
pip install hiplot

In [None]:
import hiplot as hip

# Interactive HiPlot view of the training rows, leaving out 'IDLink' and the
# three social-network columns.
data = (
    train
    .drop(columns=['IDLink', 'Facebook', 'GooglePlus', 'LinkedIn'])
    .to_dict(orient='records')
)
hip.Experiment.from_iterable(data).display()

As the graph above shows, the ranking of a news item on the different social networks has no effect on the sentiment of its headline or title.


In [None]:
# Column names of the training frame.
train.columns

In [None]:
# Number of rows for each distinct value of 'Topic'.
train['Topic'].value_counts()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# First rows of the training frame.
train.head()

In [None]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorator that reports what a pipeline step returned and how long it took.

    After every call of the wrapped function one line is printed containing
    the function name, the ``shape`` attribute of the returned object
    (``None`` when the result has no ``shape``) and the elapsed wall-clock
    time in seconds. The wrapped function's return value is passed through
    unchanged.

    Parameters
    ----------
    func : callable
        The step to wrap.

    Returns
    -------
    callable
        A wrapper that keeps ``func``'s name and docstring (``functools.wraps``).
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        # total_seconds() yields a plain number, so the trailing "s" in the
        # message is the correct unit (str(timedelta) would print H:MM:SS).
        elapsed = (dt.datetime.now() - tic).total_seconds()
        # Not every step returns an array-like; fall back to None instead of
        # raising AttributeError after the step has already completed.
        shape = getattr(result, "shape", None)
        print(f"just ran step {func.__name__} shape={shape} took {elapsed:.3f}s")
        return result
    return wrapper

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Single analyzer shared by every call of find_sentiment_nltk, instead of
# re-importing and re-building it inside the function on each call.
sia = SentimentIntensityAnalyzer()


@log_step
def find_sentiment_nltk(data):
    """Add compound sentiment scores for the 'Headline' and 'Title' columns.

    Two columns are written onto ``data`` (the frame is modified in place):
    ``sentihead`` from 'Headline' and ``sentititle`` from 'Title'. Each holds
    the ``'compound'`` entry of ``sia.polarity_scores`` for that row's text.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with text columns 'Headline' and 'Title'.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the two score columns added.
    """
    def compound_score(text):
        return sia.polarity_scores(text)['compound']

    for text_col, score_col in (('Headline', 'sentihead'), ('Title', 'sentititle')):
        # Take the compound value directly rather than expanding every score
        # dict into a Series (`apply(pd.Series)`), which is far slower.
        # Missing text is scored as an empty string so a NaN cell does not
        # crash the analyzer.
        data[score_col] = data[text_col].fillna('').apply(compound_score)

    return data




In [None]:
from sklearn.preprocessing import LabelEncoder

@log_step
def encoding(data):
    """Label-encode the 'Source' and 'Topic' columns of ``data`` in place.

    Each column is replaced by the integer codes returned by
    ``LabelEncoder.fit_transform``. One-hot encoding via ``pd.get_dummies``
    is an alternative that is not applied here.

    Returns the same (mutated) frame so the function can be used with
    ``DataFrame.pipe``.
    """
    encoder = LabelEncoder()
    for column in ('Source', 'Topic'):
        # fit_transform refits the encoder, so reusing one instance is safe.
        data[column] = encoder.fit_transform(data[column])
    return data
    

In [None]:
@log_step
def impute(data):
    """Fill missing 'Source' values with the placeholder 'Empty' (in place) and return the frame."""
    source_filled = data['Source'].fillna('Empty')
    data['Source'] = source_filled
    return data



In [None]:
@log_step
def start_pipeline(dataf):
    """Return a copy of ``dataf`` so later pipeline steps do not modify the caller's frame."""
    working_copy = dataf.copy()
    return working_copy

In [None]:
# Parse the publish timestamp, then expose its month / year / day as columns.
train['PublishDate'] = pd.to_datetime(train['PublishDate'])
publish_parts = train['PublishDate'].dt
train['PublishDate-Month'] = publish_parts.month
train['PublishDate-Year'] = publish_parts.year
train['PublishDate-Day'] = publish_parts.day

In [None]:
# 'IDLink' and the raw 'PublishDate' are no longer needed once the date parts exist.
train.drop(columns=['IDLink', 'PublishDate'], inplace=True)

In [None]:
# Preprocessing pipeline, innermost call first:
# copy -> impute -> sentiment features -> label encoding.
train_df = encoding(
    find_sentiment_nltk(
        impute(
            start_pipeline(train)
        )
    )
)

In [None]:
# Features: everything except the two targets, restricted to numeric columns.
# The raw 'Title' / 'Headline' text is still present in train_df at this
# point, and scikit-learn estimators cannot be fitted on string columns.
# The text is represented by the 'sentititle' / 'sentihead' scores instead.
X = train_df.drop(columns=['SentimentTitle', 'SentimentHeadline']).select_dtypes(include='number')
# Targets: both sentiment columns, predicted jointly.
y = train_df[['SentimentTitle', 'SentimentHeadline']]

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The seed is fixed so the split,
# and every score computed on it, is identical on each run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=2021)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest fitted on both targets at once; seeded so repeated runs
# build the same forest.
clf = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=2021)
clf.fit(X_train, y_train)


In [None]:
# Predict on the held-out validation features with the fitted model.
y_pred = clf.predict(X_valid)

In [None]:
# Preview the first predictions instead of dumping the whole array.
y_pred[:5]

In [None]:
# Preview the first validation targets instead of dumping the whole frame.
y_valid.head()

In [None]:
# First rows of the training features.
X_train.head()

In [None]:
# First rows of the training targets.
y_train.head()

In [None]:
# Label-encode every column of train_df that is still of object dtype and
# remember those column names in `cat_columns`.
# NOTE(review): X_train / X_valid were created from train_df in earlier
# cells, so this encoding does not reach the frames the models are fitted on.
cat_columns = list(train_df.select_dtypes('object').columns)

for col in cat_columns:
    print(col)
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])

In [None]:
# Positions of the categorical columns inside the feature matrix that is
# passed to CatBoost. They are computed against X_train, not train_df,
# because train_df also contains the target columns and would shift positions.
cat_features_index = [i for i, col in enumerate(X_train.columns) if col in cat_columns]



In [None]:
# Upper bound on boosting iterations (passed to CatBoost as `n_estimators`).
NUM_OF_BOOST_ROUND = 10000
# Passed to CatBoost as `early_stopping_rounds`.
EARLY_STOPPING = 300

In [None]:
# CatBoost settings. The model is fitted on two target columns at once
# ('SentimentTitle' and 'SentimentHeadline'), which requires a
# multi-regression objective. The default single-target RMSE loss rejects a
# two-column label, and the evaluation metric has to be a multi-target one.
params = {
    'cat_features': cat_features_index,
    'loss_function': 'MultiRMSE',
    'eval_metric': 'MultiRMSE',
    'random_seed': 2021,
    'n_estimators' : NUM_OF_BOOST_ROUND
}

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Fit CatBoost with early stopping monitored on the validation split.
bst = CatBoostRegressor(early_stopping_rounds=EARLY_STOPPING, **params)
_ = bst.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    plot=True,
    verbose=False,
)

In [None]:
# Baseline CatBoost model with otherwise default settings.
# NOTE: this rebinds `bst`, replacing the early-stopped model fitted above.
# The target has two columns, so the multi-regression objective is required,
# and the categorical column positions must be passed explicitly.
bst = CatBoostRegressor(
    loss_function='MultiRMSE',
    cat_features=cat_features_index,
)
bst.fit(X_train, y_train)