**Importing used libraries**

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pylab as plt
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler 
from sklearn import preprocessing
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from kaggle.competitions import twosigmanews
from itertools import chain
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split


**Loading training dataset**

In [None]:
# Create the competition environment and pull the two training frames from it.
env = twosigmanews.make_env()
market_df, news_df = env.get_training_data()


**Define Global Variables**

In [None]:

# Seed for every stochastic estimator so Restart & Run All reproduces the same
# fitted model (same value the disabled validation split uses).
RANDOM_STATE = 0

# Linear baseline; instantiated here but not fitted anywhere in the visible cells.
lr = LinearRegression()

# Single-hidden-layer perceptron used for the submission. Without random_state
# the weight initialisation and batch shuffling differ on every run.
MLPR = MLPRegressor(
    hidden_layer_sizes=(50, ),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    random_state=RANDOM_STATE,
)

**Starting Data exploration and Analysis**

*Preprocessing: clean the market data, engineer the news features, and merge the two tables*

In [None]:
# Transformers fitted on the training news. Filled by merge(..., istrain=True)
# and reused by later merge(..., istrain=False) calls so that prediction-day
# features are projected into the same space the model was trained on.
_NEWS_TRANSFORMS = {}

# The ten novelty/volume count columns that are compressed with PCA.
_VOLUME_COLS = [
    'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D', 'noveltyCount5D',
    'noveltyCount7D', 'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D',
    'volumeCounts5D', 'volumeCounts7D',
]

# The three sentiment probability columns that are compressed with LDA.
_SENTIMENT_COLS = ['sentimentNegative', 'sentimentNeutral', 'sentimentPositive']


def _to_trading_day(times):
    """Shift timestamps back 22 hours and round them up to the next midnight."""
    times = pd.to_datetime(times, format='%Y%m%d', utc=None)
    return (times - np.timedelta64(22, 'h')).dt.ceil('1D')


def _prepare_market(market_df):
    """Return a cleaned copy of the market frame; the caller's frame is untouched."""
    if 'universe' in market_df.columns:
        # Keep only rows flagged as in-universe, then drop the flag itself.
        # Filtering + drop already yields a fresh frame, so no extra copy is needed.
        market_df = market_df[market_df['universe'] == 1].drop(['universe'], axis=1)
    else:
        market_df = market_df.copy()

    market_df['time'] = _to_trading_day(market_df['time'])

    # Missing values are filled with each column's most frequent value
    # (rows are NOT dropped).
    return market_df.fillna(market_df.mode().iloc[0])


def _fit_news_transforms(news_df):
    """Fit the volume (scaler + PCA) and sentiment (scaler + LDA + max-abs) transformers."""
    volume_scaler = StandardScaler()
    # One full-SVD PCA truncated to 3 components replaces the original
    # "full PCA followed by a 3-component PCA of its output"; 'full' keeps the
    # decomposition deterministic.
    volume_pca = PCA(n_components=3, svd_solver='full')
    volume_pca.fit(volume_scaler.fit_transform(news_df[_VOLUME_COLS].values))

    sentiment_scaler = StandardScaler()
    scaled_sentiment = sentiment_scaler.fit_transform(news_df[_SENTIMENT_COLS].values)
    lda = LinearDiscriminantAnalysis(n_components=1)
    projected = lda.fit_transform(scaled_sentiment, news_df['sentimentClass'].values)
    sentiment_range = preprocessing.MaxAbsScaler().fit(projected)

    return {
        'volume_scaler': volume_scaler,
        'volume_pca': volume_pca,
        'sentiment_scaler': sentiment_scaler,
        'lda': lda,
        'sentiment_range': sentiment_range,
    }


def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator with inf and NaN results mapped to 0."""
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = numerator / denominator
    # A zero denominator would otherwise leave inf in the features, which the
    # final fillna(0) does not remove.
    return np.where(np.isfinite(ratio), ratio, 0.0)


def _add_news_features(news_df, transforms):
    """Replace the raw volume/sentiment/count columns with their engineered features."""
    volume = transforms['volume_pca'].transform(
        transforms['volume_scaler'].transform(news_df[_VOLUME_COLS].values))
    sentiment = transforms['sentiment_range'].transform(
        transforms['lda'].transform(
            transforms['sentiment_scaler'].transform(news_df[_SENTIMENT_COLS].values)))

    # Assign plain arrays (positional) rather than Series/DataFrames so the
    # values cannot be misaligned when news_df does not have a 0..n-1 index.
    news_df['vol1'] = volume[:, 0]
    news_df['vol2'] = volume[:, 1]
    news_df['vol3'] = volume[:, 2]
    news_df['sentiment'] = sentiment.ravel()

    # Share of words that carry sentiment, and relative position of the first mention.
    news_df['sentiment_prop'] = _safe_ratio(
        news_df['sentimentWordCount'].values, news_df['wordCount'].values)
    news_df['sentence_prop'] = _safe_ratio(
        news_df['firstMentionSentence'].values, news_df['sentenceCount'].values)

    news_df = news_df.drop(
        _VOLUME_COLS + _SENTIMENT_COLS
        + ['sentimentClass', 'sentimentWordCount', 'wordCount',
           'firstMentionSentence', 'sentenceCount'],
        axis=1)

    # Subjects and audiences are represented only by their length.
    news_df['subjects'] = news_df.subjects.apply(len)
    news_df['audiences'] = news_df.audiences.apply(len)
    return news_df


def _explode_asset_codes(news_df):
    """Return one news row per quoted asset code found in the 'assetCodes' string."""
    codes = news_df['assetCodes'].str.findall(r"'([\w\./]+)'")
    flat_codes = list(chain.from_iterable(codes))
    row_labels = news_df.index.repeat(codes.apply(len))
    assert len(row_labels) == len(flat_codes)

    code_map = pd.DataFrame({'level_0': row_labels, 'assetCode': flat_codes})
    expanded = pd.merge(code_map, news_df, left_on='level_0', right_index=True,
                        suffixes=('', '_old'))
    return expanded.drop(['level_0', 'assetCodes'], axis=1)


def merge(market_df, news_df, istrain):
    """Clean both tables, engineer the news features and join news onto market rows.

    Parameters
    ----------
    market_df : pandas.DataFrame
        Market observations. A 'universe' column, when present, is used to keep
        only rows equal to 1. The frame passed in is not modified.
    news_df : pandas.DataFrame
        News observations. The frame passed in is not modified.
    istrain : bool
        True for the training data: news timestamps are converted to trading
        days and the news transformers are fitted and remembered.
        False for a prediction day: every news row is stamped with the first
        market row's day, the remembered transformers are reused (they are
        fitted on the given news only if no training call happened yet), and
        only the first news row per asset code is kept.

    Returns
    -------
    (X_df, Y_df)
        X_df is the feature frame with columns sorted by name and missing
        values set to 0. Y_df is the 'returnsOpenNextMktres10' column when the
        merged frame has one, otherwise the integer 0.
    """
    market_df = _prepare_market(market_df)

    # Dropping unnecessary columns (this also gives us a private copy of the news).
    news_df = news_df.drop(
        ['sourceTimestamp', 'firstCreated', 'sourceId', 'headline', 'headlineTag', 'assetName'],
        axis=1)

    if istrain:
        news_df['time'] = _to_trading_day(news_df['time'])
    else:
        # The market time is already a trading-day midnight, and re-applying the
        # 22h shift + ceil to a midnight returns the same midnight, so it is
        # assigned directly. .iloc[0] is positional: it does not require the
        # market index to contain the label 0.
        news_df['time'] = market_df['time'].iloc[0]

    if istrain:
        _NEWS_TRANSFORMS.clear()
        _NEWS_TRANSFORMS.update(_fit_news_transforms(news_df))
    transforms = _NEWS_TRANSFORMS or _fit_news_transforms(news_df)

    news_df = _add_news_features(news_df, transforms)

    # Make the news table mergeable; rebinding the name releases the
    # un-expanded frame before the large join below.
    news_df = _explode_asset_codes(news_df)
    if not istrain:
        news_df = news_df.drop_duplicates(subset='assetCode')

    # Right join: every market row is kept, with news columns where available.
    X_df = news_df.merge(market_df, how='right', on=['time', 'assetCode'])

    # Free the inputs of the join before the remaining column operations.
    del news_df
    del market_df

    if 'returnsOpenNextMktres10' in X_df.columns:
        Y_df = X_df['returnsOpenNextMktres10']
        X_df = X_df.drop('returnsOpenNextMktres10', axis=1)
    else:
        Y_df = 0

    X_df = X_df.drop(['assetCode', 'time', 'provider', 'marketCommentary', 'assetName'], axis=1)
    X_df = X_df.fillna(0)

    # Sorting by column name gives train and prediction frames the same column order.
    X_df = X_df.sort_index(axis=1)

    return X_df, Y_df

    
    

In [None]:
# Build the training feature matrix and target from the raw frames, then
# obtain the iterable of prediction days from the environment.
X_df, Y_df = merge(market_df, news_df, istrain=True)
days = env.get_prediction_days()

In [None]:
# Disabled validation split, kept as a string literal rather than live code:
# it would hold out 20% of the rows (random_state=0) and delete X_df / Y_df.
# NOTE(review): the training cell below fits on X_df / Y_df, so enabling this
# as-is would break it; as the only expression in the cell, the string is also
# echoed as cell output. Convert to # comments or delete once no longer needed.
"""#****VALIDATION SET DATA******
    X_train, X_valid, y_train, y_valid = train_test_split(X_df, Y_df, test_size=0.2, random_state=0)

    del X_df
    del Y_df"""

**Training the model on the merged features**

In [None]:
# Fit the MLP regressor on the full merged feature frame and its target
# (no rows are held out; the validation split above is disabled).
MLPR.fit(X_df, Y_df)

In [None]:
# Walk through the prediction days one at a time: build that day's features,
# score them, and hand the filled-in template back to the environment.
for market_test_df, news_test_df, predictions_template_df in days:
    # merge() returns (features, target); prediction days have no target.
    X_test_df = merge(market_test_df, news_test_df, False)[0]
    # The regressor output is unbounded, while the competition defines
    # confidenceValue on [-1, 1]; clip so an outlier cannot invalidate the day.
    predictions_template_df['confidenceValue'] = np.clip(MLPR.predict(X_test_df), -1.0, 1.0)
    env.predict(predictions_template_df)
print('Done')

In [None]:
# Ask the competition environment to write the submission file now that every
# prediction day has been passed to env.predict().
env.write_submission_file()

In [None]:
import os

# Show every entry of the working directory whose name contains '.csv'.
print(list(filter(lambda entry: '.csv' in entry, os.listdir('.'))))