In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random

import ast
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)

from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from wordcloud import WordCloud
import plotly.graph_objs as go

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

%matplotlib inline
# Any results you write to the current directory are saved as output.

**Prepare for the data**

Get the two datasets (market and news) separately.

In [None]:
# get the data from kaggle
# The competition module builds the environment object from which the
# training frames are pulled in the next cell.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()
print('Done!')

In [None]:
# two training data sets, market and news
# env.get_training_data() returns a pair that is unpacked as (market, news).
(market_train_df, news_train_df) = env.get_training_data()

Print the first few observations of the two data sets.

In [None]:
# (rows, columns) of the raw market frame
market_train_df.shape

In [None]:
# first rows of the raw market frame
market_train_df.head()

There are 4072956 rows and 16 columns in the market data.

In [None]:
# (rows, columns) of the raw news frame
news_train_df.shape

In [None]:
# first rows of the raw news frame
news_train_df.head()

There are 9328750 rows and 35 columns in the news data.

**Pick up needed observations from market data.**

There are too many observations across different companies and years to work with directly, so we pick 30 different companies and follow them across the years.

* *Deal with missing values in market data.*

Firstly, there are some missing values in the market data, in columns such as returnsClosePrevMktres1, returnsOpenPrevMktres1, returnsClosePrevMktres10 and returnsOpenPrevMktres10. These variables have an effect on the response.

In [None]:
#  check out the missing values in market data
# Boolean frame of the same shape: True where a cell is missing.
miss_market = pd.isnull(market_train_df)
# Column-wise sum of the boolean frame = number of missing cells per column.
miss_market.sum(axis=0)

From the above, four columns have missing values: returnsClosePrevMktres1 has 15980 missing values, returnsOpenPrevMktres1 has 15988, returnsClosePrevMktres10 has 93010 and returnsOpenPrevMktres10 has 93054.
Because we want to reduce the number of rows in the market data for this project anyway, and to see the contribution of each variable more clearly, we clean the data by dropping every row that has a missing value in any of these four variables.

In [None]:
# dropna() with no arguments removes every row that has a missing value in
# ANY column, not only in the four return columns discussed above.
train_market = market_train_df.dropna()
train_market.shape

After dealing with missing values, there are fewer rows in the market data: only 3979902 observations remain. Because our purpose is to analyse the market trend over the following days, time matters a great deal. So the next step is to randomly pick 30 companies and keep their observations across different years.

* *Get the 30 companies in different years.*

Firstly, we randomly choose 30 companies based on their assetCode.

In [None]:
# Full 'assetCode' column of the cleaned market frame (one entry per row).
assetCode = train_market.loc[:, 'assetCode']
# Fixed seed so the same 30 entries are drawn on every run.
np.random.seed(1234)
# NOTE(review): the draw is over row entries, not over unique codes, so assets
# with more rows are more likely to be picked and the 30 draws could contain the
# same code more than once. Sampling from assetCode.unique() would avoid that,
# but later cells hard-code tickers from this exact sample - confirm before changing.
sample_asset = np.random.choice(assetCode, 30, replace = False)
sample_asset

Above, we chose 30 companies randomly without replacement. Next, we restrict the data to these 30 companies across the different years.

In [None]:
# Keep only the rows whose assetCode is one of the sampled codes.
train_market = train_market[train_market['assetCode'].isin(sample_asset)]
train_market.shape

In [None]:
# see the sample data (first rows of the market frame restricted to the sampled assets)
train_market.head()

Based on the result above, we get 60383 observations for 30 companies across different years.

* *Observe the target variable*

For better analysis, we take a quick look at the target variable. According to the proposal, the target variable is returnsOpenNextMktres10.

In [None]:
# Plot, for every timestamp, selected cross-sectional quantiles of the target
# variable over the sampled assets. One line trace is drawn per quantile.
data = []
for i in [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]:
    # Quantile i of the target across all assets that share a timestamp.
    price_df = train_market.groupby('time')['returnsOpenNextMktres10'].quantile(i).reset_index()

    data.append(go.Scatter(
        x = price_df['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = price_df['returnsOpenNextMktres10'].values,
        name = f'{i} quantile'
    ))
# The plotted series is a return, not a price, so the axis is labelled with the
# variable name instead of the former 'Price (USD)'.
layout = go.Layout(
    title = "Trends of returnsOpenNextMktres10",
    yaxis = dict(title = 'returnsOpenNextMktres10'),
    legend = dict(orientation="h"),
)
py.iplot(dict(data=data, layout=layout), filename='basic-line')

* *Choose the data after 2010*

Based on the result, we can easily see the trends of returnsOpenNextMktres10. From the quantiles we can roughly read off the mean and the variance: the mean looks roughly the same over time, but the variance differs hugely. For example, around 2009 the variance is larger than in any other period.

Given this phenomenon, we investigated what happened at that time to make the variance so large. We found that it was the financial crisis. According to our investigation, this particular crisis was related to the bursting of the US housing bubble, already-rising default rates on "subprime" mortgages and an increase in adjustable-rate mortgages (ARM). Multiple factors jointly caused the financial crisis.

However, we'd like to use the news to predict stock movement, so we do not take the financial crisis into account. Therefore, we only consider these companies from 2010 onwards. Based on the picture above, the mean and the variance are relatively flat in that period compared with before.

In [None]:
# Keep only the sampled-asset rows from 2010 onwards.
# The boolean mask is built from train_market itself (the frame being indexed).
# The previous version built it from market_train_df, which only aligned because
# that frame was still the full raw data; re-running the cell then failed.
# .copy() makes the result an independent frame so later column assignments
# do not write into a slice of train_market.
market_train_df = train_market.loc[train_market['time'] >= '2010-01-01 22:00:00+0000'].copy()
market_train_df.shape

Finally, we have the prepared market data with 43242 rows and 16 columns, covering the 30 randomly picked companies from 2010 onwards.

* *Observe the final data set and see the trend of the close price*

In [None]:
# Build one close-price line trace per sampled asset and draw them together.
data = [
    go.Scatter(
        x = rows['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = rows['close'].values,
        name = code
    )
    for code, rows in (
        (code, market_train_df[market_train_df['assetCode'] == code])
        for code in sample_asset
    )
]
layout = go.Layout(
    title = "Close price of 30 random assets",
    yaxis = dict(title = 'Price (USD)'),
    legend = dict(orientation="h"),
)
py.iplot(dict(data=data, layout=layout), filename='basic-line')

It seems that this sample is random enough: some of the series change dramatically while others are flat, and some companies even disappear in certain years. Based on the result, we would say the dataset is sufficient to illustrate our problem for this specific question.


**Show the outcome**

For this specific question, we treat the target variable "returnsOpenNextMktres10" as a classification problem. Typically, we want to know whether a stock increases or decreases. So, if the outcome is positive, we consider the stock to have increased over the 10 days, and otherwise to have decreased.

In [None]:
# Binary label: 1 when the target return is strictly positive, -1 otherwise
# (zero and missing values fall into the -1 class, as before).
market_train_df['indicator'] = np.where(market_train_df['returnsOpenNextMktres10'] > 0, 1, -1)
# normalize=True yields class shares directly instead of dividing by a
# hard-coded row count. `kind` is passed by keyword because Series.plot
# rejects positional arguments in pandas >= 1.0.
market_train_df['indicator'].value_counts(normalize=True).plot(kind='bar')
plt.xticks(rotation=30)
plt.title('Classification of returnsOpenNextMktres10(target variable)')

Overall, the outcome has two classes, increase or decrease. "1", representing an increase, accounts for about 52% of the observations, and "-1", representing a decrease, for roughly 48%. The classes are almost balanced, so treating this as a classification problem seems reasonable.

**Deal with the news data in order to merge with the market data**

We'd like to merge the two data sets. Intuitively, earlier news could have an impact on the stock movement, but exactly how earlier news influences the movement is important: news published on more recent days may influence the stock differently from older news. Under this circumstance, we should first consider:

* choosing how many days of news to use for prediction
* after choosing the number of days, creating a new observation that aggregates all the information from those recent days' news.

Extract the news data that contains the 30 assetCodes.


In [None]:
# Flag news rows whose 'assetCodes' set mentions at least one sampled asset.
# Each 'assetCodes' string is parsed once per row (the previous version
# re-parsed it for every sampled asset) and checked with a set operation.
sample_asset_set = set(sample_asset)
news_train_df['indicator'] = news_train_df['assetCodes'].map(
    lambda x: 0 if sample_asset_set.isdisjoint(ast.literal_eval(x)) else 1
)
# .copy() so that later column assignments act on an independent frame.
news_train_df = news_train_df[news_train_df['indicator'] == 1].copy()
news_train_df.shape

We keep a news row as long as its 'assetCodes' contains any of the assetCodes in the sample we picked before. Following this criterion, we get 203840 rows and 36 columns of news data, which is much smaller than the original news data.

* *Explore the variables*

Firstly, we'd like to explore the variables in the news data and look for correlations between variables.

1. *Headline*

Each news item has its own headline, which varies from day to day and from provider to provider. Hence it is difficult to see the headline's influence on our target variable directly. We consider a simpler, more direct way to judge whether it has an impact on stock movement. Based on the plots above, stock movements differ considerably between companies in 2013 - 2014: for example, CELG.O increased dramatically, whereas PMCS.O shows almost no change. We compare the headlines of the two companies in this period to see whether their word clouds differ. If they are indeed different, we can conclude with some confidence that the headline influences the stock movement.

Firstly, we select all the news whose assetCodes include CELG.O and build a word cloud from this selection. In this way, we get a quick glance at the important words in the headlines of news mentioning CELG.O.

We then do the same for PMCS.O and compare the top words, to better decide whether the headline could influence the target variable.

In [None]:
# pick up the news rows whose assetCodes string mentions CELG.O
# (plain substring test; regex=False so the '.' is matched literally)
news_train_df['CELG_indicator'] = news_train_df['assetCodes'].str.contains('CELG.O', regex=False).astype(int)
news_CELG = news_train_df[news_train_df['CELG_indicator'] == 1]

# make the wordcloud from the lower-cased headlines
# (the slice keeps at most the last 1,000,000 headlines)
text = ' '.join(news_CELG['headline'].str.lower().values[-1000000:])
wordcloud = WordCloud(max_font_size=None, stopwords=stop, background_color='white',
                      width=1200, height=1000).generate(text)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud)
# The ticker was misspelled as 'CLEG.O' in the figure title.
plt.title('Top words in headline of Company CELG.O')
plt.axis("off")
plt.show()

In [None]:
# first rows of the news subset selected for CELG.O
news_CELG.head()

In [None]:
# distinct asset names that occur in the CELG.O news subset
news_CELG['assetName'].unique()


According to the word cloud, the most frequently mentioned words are the assetName of the particular assetCode.

In [None]:
# Select the news rows whose assetCodes string contains PMCS.O
# (str.find returns the match position; a hit past position 0 sets the flag to 1).
news_train_df['PMCS_indicator'] = [
    1 if codes.find('PMCS.O') > 0 else 0
    for codes in news_train_df['assetCodes']
]
news_PMCS = news_train_df.loc[news_train_df['PMCS_indicator'].eq(1)]

# Word cloud over the lower-cased headlines of that subset.
lowered_headlines = news_PMCS['headline'].str.lower().values
text = ' '.join(lowered_headlines[-1000000:])
cloud_options = dict(max_font_size=None, stopwords=stop, background_color='white',
                     width=1200, height=1000)
wordcloud = WordCloud(**cloud_options).generate(text)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud)
plt.title('Top words in headline of Company PMCS.O')
plt.axis("off")
plt.show()

In [None]:
# distinct asset names that occur in the PMCS.O news subset
news_PMCS['assetName'].unique()

Also, the most frequently mentioned words in this word cloud are the assetName of the specific assetCode.

So, we find that the headline is closely connected with the assetCode, and the two word clouds differ significantly from each other. We would conclude that the headline does have an effect on the stock movements.

2.  *urgency*

Urgency differentiates story types: "1" represents an alert and "3" represents an article. We want to check whether any other values exist and to compare the numbers of alerts and articles.


In [None]:
# Bar chart of how often each urgency value occurs, in millions of rows.
# `kind` is passed by keyword: Series.plot rejects positional arguments in pandas >= 1.0.
(news_train_df['urgency'].value_counts() / 1000000).plot(kind='bar');
plt.xticks(rotation=30);
plt.title('Urgency counts (mln)');

Based on the analysis above, only "1" and "3" exist in the data, and the count for "3" is about double the count for "1". This variable can be seen as a weight for the importance of the news.

3.  *takeSequence*

This variable is the take sequence number of the news item, starting at 1. For a given story, alerts and articles have separate sequences. From the analysis above, we know news items are divided into 2 types, articles and alerts. For this variable, we want to see how the sequence numbers vary between the two types.

Firstly, we check when the news type is alert.

In [None]:
# Alert-type news only (urgency == 1).
news_alert = news_train_df[news_train_df['urgency'] == 1]
# Share of each takeSequence value among alerts; only the 10 most frequent are drawn.
# .head(10) is explicitly positional (the counts are indexed by integer sequence
# numbers), and `kind` is passed by keyword because Series.plot rejects
# positional arguments in pandas >= 1.0.
(news_alert['takeSequence'].value_counts().head(10) / len(news_alert)).plot(kind='bar');
plt.xticks(rotation=30);
plt.title('takeSequence counts if type is an alert');
news_alert['takeSequence'].unique()

According to the result, if the news is an alert, the news item can have many different sequence numbers; in fact they range from 1 to 94. For a clear view, we only plot the most frequent sequence numbers. It seems that news items tend to be mentioned as early as possible.

Then we check whether the sequence looks different when the news type is an article.

In [None]:
# Article-type news only (urgency == 3).
news_article = news_train_df[news_train_df['urgency'] == 3]
# Share of each takeSequence value among articles; only the 10 most frequent are drawn.
# .head(10) is explicitly positional, and `kind` is passed by keyword because
# Series.plot rejects positional arguments in pandas >= 1.0.
(news_article['takeSequence'].value_counts().head(10) / len(news_article)).plot(kind='bar');
plt.xticks(rotation=30);
plt.title('takeSequence counts if type is an article')
news_article['takeSequence'].unique()

According to the result, if the item is an article, the sequence number ranges from 1 to 95 but with less variety compared with alerts. It is also clear that for articles the takeSequence "1" is far more frequent than the others. So we may make a simple assumption: if the news type is an article, we treat the takeSequence as "1".

4. *headlineTag*

We have already seen the influence of the headline. More specifically, we want to look at the differences in headline tags and find out which headline tag appears more frequently than the others.

In [None]:
# Ten most frequent headlineTag values, in thousands of rows.
# `kind` is passed by keyword: Series.plot rejects positional arguments in pandas >= 1.0.
(news_train_df['headlineTag'].value_counts() / 1000).head(10).plot(kind='barh');
plt.title('headlineTag counts (thousands)');

Based on the result, many of the news items do not have a specific headline tag, and "BRIET" appears to be the most frequently used headline tag. The headline tag is extracted from the headline, so it no doubt has a strong connection with the headline. For our purposes, to avoid collinearity, we may remove this variable from our model.



**5. provider**

Now we look at the variable 'provider'. First we count the number of news items for each provider. Then we draw a bar chart and a word cloud to show the result.

In [None]:
# Horizontal bars for the 12 most frequent news providers.
provider_counts = news_train_df['provider'].value_counts()
provider_counts[:12].plot.barh()
plt.xticks(rotation=90)
plt.title('Provider Count')

In [None]:
# Word cloud over the provider codes: all values of the 'provider' column are
# joined into one space-separated string. collocations=False turns off the
# counting of word pairs, so each provider code is counted on its own.
text = ' '.join(news_train_df['provider'])
wordcloud = WordCloud(collocations=False).generate(text)
plt.figure(figsize=(12, 3))
plt.imshow(wordcloud)
plt.title('Top words in provider')
plt.axis("off")
plt.show()

We can see that most of the news comes from the provider RTRS.

**6. body size**

Let's see body size. First, we look at the statistical summary. We find that the range is very large.

In [None]:
# summary statistics of the bodySize column
news_train_df['bodySize'].describe()

In [None]:
# boxplot of the strictly positive body sizes (rows with bodySize <= 0 are left out)
plt.boxplot(news_train_df['bodySize'][news_train_df['bodySize'] >0])

From the boxplot, we find that there are a lot of outliers. So we may consider different groups of body size in future analysis.

**word count**

We divide sentimentWordCount by wordCount to get a new variable, sentiPercent, which takes different values across news items; a large value can then be given a large sentiment weight.

In [None]:
# Share of the item's words that are counted as sentiment words.
# NOTE(review): a wordCount of 0 would give inf/NaN here - confirm it cannot occur.
news_train_df['sentiPercent'] = news_train_df['sentimentWordCount'] / news_train_df['wordCount']
plt.boxplot(news_train_df['sentiPercent'])

From the boxplot, we find that the typical value (the median line) is almost 1, which means that most of the words in a typical news item are counted as sentiment words.

**sentenceCount**

For sentenceCount, we divide firstMentionSentence by sentenceCount to get a new variable, firstMentionPercent; we can use this variable to gauge the importance of the news.

In [None]:
# Position of the first mention relative to the number of sentences in the item.
# NOTE(review): a sentenceCount of 0 would give inf/NaN here - confirm it cannot occur.
news_train_df['firstMentionPercent'] = news_train_df['firstMentionSentence'] / news_train_df['sentenceCount']
plt.boxplot(news_train_df['firstMentionPercent'])

We find that the typical value (the median line of the boxplot) is around 0.2, which is a relatively small number.


**sentimentClass**

In [None]:

# Bar chart of the sentimentClass counts, in millions of rows.
# `kind` is passed by keyword: Series.plot rejects positional arguments in pandas >= 1.0.
(news_train_df['sentimentClass'].value_counts() / 1000000).plot(kind='bar');
plt.xticks(rotation=30);
plt.title('sentimentCLass counts (mln)');
# For each sentiment class, list the five asset names with the most news rows.
for i, j in zip([-1, 0, 1], ['negative', 'neutral', 'positive']):
    df_sentiment = news_train_df.loc[news_train_df['sentimentClass'] == i, 'assetName']
    print(f'Top mentioned companies for {j} sentiment are:')
    print(df_sentiment.value_counts().head(5))
    print('')

From the counts, it seems that the overall sentiment leans towards the negative side. Among these companies, Citigroup is the most frequently mentioned with negative sentiment, Barclays with neutral sentiment, and Barclays PLC with positive sentiment.

**marketCommentary**

First, we plot a bar chart of the different groups.

In [None]:
# Bar chart of the marketCommentary value counts, in millions of rows.
# `kind` is passed by keyword: Series.plot rejects positional arguments in pandas >= 1.0.
(news_train_df['marketCommentary'].value_counts() / 1000000).plot(kind='bar');
plt.xticks(rotation=30);
plt.title('marketCommentary counts (mln)');

From the bar chart, we find that almost all marketCommentary values fall in the same group, False. So we can consider that the variable marketCommentary has no effect on news importance, and we can delete this variable.


Because we can ignore headlineTag, we can also ignore headline.
In conclusion, we can ignore the variables headlineTag, headline, marketCommentary and provider.
Then we try to create two new variables based on 'subject' and 'audience'.

**Merge data**

For this specific question, we want to use a kernel function to give different weights to the information from previous news.

Previous news may have a different influence on the present stock movement depending on how recent it is, so we want to evaluate the news' influence with its timeliness in mind. To evaluate the possible effect of news articles on a specific assetCode at a given time, we define a time interval before that time, which we call the window of influence.

To define the weights of previous days for the same assetCode at a given time, we use **sliding windows** over the time series. As for the weights, because we only care about the effect of elapsed time on the news, the exponential distribution is the first choice.

As is well known, the exponential distribution arises in modelling the time or distance between occurrences of events. Its definition is as follows:

The random variable $X$ that equals the distance between successive events from a Poisson process with mean number of events $\lambda > 0$ per unit interval is an exponential random variable with parameter $\lambda$. The probability density function of $X$ is $f(x) = \lambda e^{-\lambda x}$ for $0 \le x < \infty$.


In [None]:
def generate(m=None):
    """Draw a lag index in ``range(m)`` with probability proportional to (1/2)**(k+1).

    A uniform number is compared against the running sum 1/2 + 1/4 + ... and
    the first index whose cumulative weight reaches it is returned. The
    weights for ``m`` terms sum to 1 - 2**-m, so a draw can fall beyond the
    last term; in that case a fresh number is drawn (rejection sampling).

    Parameters
    ----------
    m : int, optional
        Number of possible outcomes. Defaults to the notebook-level ``M``,
        which keeps the original zero-argument call working.

    Returns
    -------
    int
        A value in ``0 .. m-1``; smaller values are more likely.

    Raises
    ------
    ValueError
        If ``m`` is smaller than 1 (no outcome could ever be returned).
    """
    if m is None:
        m = M  # notebook-level default, looked up at call time
    if m < 1:
        raise ValueError("m must be at least 1")
    # Loop instead of recursing on rejection, and avoid shadowing builtin max().
    while True:
        proba = random.random()
        cumulative = 0
        for i in range(1, m + 1):
            cumulative += (1/2)**i
            if proba <= cumulative:
                return i - 1
M = 10  # number of distinct lags generate() can return (0 .. M-1)
N_DRAWS = 1000  # size of the simulated sample
random.seed(1234)  # fixed seed so the empirical curve is the same on every run
# generate() only returns values in 0 .. M-1, so M counters are enough; the
# previous M+1 counters left a slot that could never be hit at the end of the curve.
res = [0] * M
for _ in range(N_DRAWS):
    res[generate()] += 1
# Empirical frequency of each lag.
plt.plot([r / N_DRAWS for r in res])

This exponential distribution gives us a straightforward kernel function: we give more weight to days closer to the present and less weight to days further away.

Hence, we use "sliding windows" with exponential weights over the days before the given date to create a new observation for each assetCode and date. This new observation includes the information from the previous days, which should give a more accurate interpretation of the target variable based on the news.

* Firstly, we create a function that aggregates the news in the same day by calculating the average.

In [None]:
# Map each distinct asset code of the market frame to an integer id (kept as a dict).
asset_code_dict = {k: v for v, k in enumerate(market_train_df['assetCode'].unique())}  # encode the codes as numbers, stored in dictionary form
print(asset_code_dict)
# News columns that are NOT in the short list below.
# NOTE(review): drop_columns is not referenced anywhere else in the visible notebook.
drop_columns = [col for col in news_train_df.columns if col not in ['sourceTimestamp', 'urgency', 'takeSequence', 'bodySize', 'companyCount', 
               'sentenceCount', 'firstMentionSentence', 'relevance','firstCreated', 'assetCodes']]
# News columns that news_today() selects before aggregating.
columns_news = ['firstCreated','relevance','sentimentClass','sentimentNegative','sentimentNeutral',
               'sentimentPositive','noveltyCount12H','noveltyCount24H','noveltyCount3D', 'noveltyCount5D','noveltyCount7D',
                'volumeCounts12H','volumeCounts24H','volumeCounts3D', 'volumeCounts5D','volumeCounts7D','assetCodes',
                'sourceTimestamp','assetName','audiences', 'urgency', 'takeSequence', 'bodySize', 'companyCount', 
                 'sentenceCount','sentimentWordCount','wordCount','firstMentionSentence','time']

The above maps each assetCode to a numeric value, to make later analysis easier.

In [None]:
def news_today(news_df):
    """Aggregate news items to one row per (calendar day, asset code).

    Selects the columns listed in the notebook-level ``columns_news``, derives
    a few extra features and averages every numeric column over all items
    that share the same ``firstCreated`` date and asset code.

    Parameters
    ----------
    news_df : pandas.DataFrame
        News items containing at least the columns in ``columns_news``.
        ``assetCodes`` and ``audiences`` hold set literals stored as strings;
        ``sourceTimestamp`` and ``firstCreated`` are datetime columns.

    Returns
    -------
    pandas.DataFrame
        One row per (``firstCreated`` date, ``assetCodes``) holding the mean
        of each numeric column. The input frame is not modified.
    """
    # Explicit copy so the assignments below never write into the caller's frame.
    news_df = news_df[columns_news].copy()
    news_df['sourceTimestamp'] = news_df['sourceTimestamp'].dt.hour
    news_df['firstCreated'] = news_df['firstCreated'].dt.date

    # Parse each set literal once, with literal_eval rather than eval so that
    # data strings are never executed as code.
    parsed_codes = news_df['assetCodes'].map(ast.literal_eval)
    news_df['assetCodesLen'] = parsed_codes.map(len)
    # Keep a single code per item. min() makes the pick deterministic; taking the
    # "first" element of a set depends on string hashing and can differ per run.
    # NOTE(review): for items mentioning several companies this keeps only the
    # alphabetically first code - confirm that is the intended attribution.
    news_df['assetCodes'] = parsed_codes.map(min)

    news_df['emotion_importance'] = news_df['sentimentWordCount'] / news_df['wordCount'] * 100
    # Number of items sharing the same asset name and sentiment class.
    news_df['asset_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
    # Read the audiences from the frame passed in, not from the global news_train_df.
    news_df['len_audiences'] = news_df['audiences'].map(lambda x: len(ast.literal_eval(x)))

    kcol = ['firstCreated', 'assetCodes']
    # numeric_only=True: string and datetime columns cannot be averaged and are left out.
    return news_df.groupby(kcol, as_index=False).mean(numeric_only=True)

In [None]:
# Daily per-asset aggregate of the filtered news frame.
news_train_day = news_today(news_train_df)
news_train_day.head()

In [None]:
# (rows, columns) of the daily news aggregate
news_train_day.shape

For this special case, we create some new columns to represent some other features.

*asset_count* is the number of news items that share the same assetName and sentimentClass, averaged over the news of the assetCode on a given day. The higher the number, the more frequently the asset is mentioned.

*len_audiences* is the number of audience types of a news item, averaged over the news for a given assetCode on a given day. A larger number means the news is aimed at a broader audience.

The overall news data becomes 24376 rows and 30 columns after we remove some trivial categorical variables and create new variables to capture more features. At the same time, all the news data variables are now numerical.

* Secondly, we use sliding windows with exponential weights to aggregate the news from the previous 7 days. In this way, we can view the overall influence of the news while taking its timeliness into account.

In [None]:
def exp_weighted_mean(assetCodes, day, df, lambd):
    """Exponentially weighted mean of one asset's daily news over a 7-day window.

    Rows of ``df`` for ``assetCodes`` dated between ``day - 6 days`` and
    ``day`` (both inclusive) are averaged with weights
    ``exp(lambd * (row_date - day).days)``, so the row for ``day`` itself gets
    weight 1 and older rows get exponentially smaller weights.

    Parameters
    ----------
    assetCodes : str
        Asset code to aggregate.
    day : datetime.date
        Last day of the window.
    df : pandas.DataFrame
        Daily news frame with ``assetCodes`` and ``firstCreated`` columns;
        every other column must be numeric.
    lambd : float
        Decay rate per day (any real number; larger means faster decay).

    Returns
    -------
    pandas.Series
        Weighted mean of each feature column plus ``assetCodes`` and ``date``.
        If no row falls inside the window, the feature values are NaN.
    """
    from datetime import timedelta

    start_day = day - timedelta(days=6)
    # Always restrict to the window. The previous version only did so when a row
    # dated exactly start_day existed and otherwise averaged the whole history.
    in_window = (
        (df['assetCodes'] == assetCodes)
        & (df['firstCreated'] >= start_day)
        & (df['firstCreated'] <= day)
    )
    window = df.loc[in_window]

    # Age in days relative to `day` (0 for the day itself, negative for older rows).
    # A float array, so that `lambd * age` is a product and not a list repetition,
    # and the result no longer depends on the row order of df.
    age_days = np.array([(created - day).days for created in window['firstCreated']], dtype=float)
    weights = np.exp(lambd * age_days)

    features = window.drop(columns=['firstCreated', 'assetCodes'])
    result = features.mul(weights, axis=0).sum() / weights.sum()
    result['assetCodes'] = assetCodes
    result['date'] = day
    return result

In [None]:
# For every (day, asset) row of the daily aggregate, compute the exponentially
# weighted mean with lambd = 1. exp_weighted_mean filters the whole of
# news_train_day on each call, so this runs one full filter per row.
news_train = news_train_day.apply(lambda row: exp_weighted_mean(row['assetCodes'], row['firstCreated'], news_train_day, 1), axis=1)

In [None]:
# (rows, columns) of the windowed news features
news_train.shape

* Thirdly, merge the news data and market data on the same assetCode for a given date.

In [None]:
def data_prep(market_df,news_df):
    """Join market rows with the aggregated news features of the same day and asset.

    Adds a calendar ``date`` and a ``close_to_open`` ratio to the market data,
    left-joins the news features on (date, asset code), drops the join key
    ``assetCodes`` and the ``assetName`` column, and fills the gaps left by
    unmatched rows (and any other missing values) with 0.

    Parameters
    ----------
    market_df : pandas.DataFrame
        Market rows with at least ``time`` (datetime), ``assetCode``,
        ``assetName``, ``close`` and ``open``.
    news_df : pandas.DataFrame
        News features keyed by ``date`` and ``assetCodes``.

    Returns
    -------
    pandas.DataFrame
        One row per market row, with the news features appended. Neither input
        frame is modified, so the calling cell can be re-run safely.
    """
    # Work on a copy: the previous version added and dropped columns on the
    # caller's frame in place, so a second run failed on the missing 'time'.
    market_df = market_df.copy()
    # Derived features / format changes.
    market_df['date'] = market_df['time'].dt.date
    market_df['close_to_open'] = market_df['close'] / market_df['open']
    market_df = market_df.drop(columns=['time'])

    merged = pd.merge(market_df, news_df, how='left', left_on=['date', 'assetCode'],
                      right_on=['date', 'assetCodes'])
    return merged.drop(columns=['assetCodes', 'assetName']).fillna(0)

In [None]:
print('Merging data ')
# From here on the name train_market refers to the merged market + news frame
# (it previously held the sampled market rows).
train_market = data_prep(market_train_df, news_train)
train_market.head()

In [None]:
# first rows of the market frame that was passed to data_prep
market_train_df.head()

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

In [None]:
# Target: the +1 / -1 direction label.
y = train_market['indicator']
# Feature matrix: every column except the raw target, the label derived from it,
# the date and the asset code.
non_feature_columns = ['returnsOpenNextMktres10', 'indicator', 'date', 'assetCode']
X_origin = train_market.loc[:, ~train_market.columns.isin(non_feature_columns)]
# Standardise the features, then fit QDA.
X = scale(X_origin)
clf = QDA()
clf.fit(X, y)
# Predictions on the same rows the model was fitted on (in-sample accuracy).
y_hat = clf.predict(X)
print(accuracy_score(y, y_hat))
print(confusion_matrix(y, y_hat))

In [None]:
# 20-fold cross-validated accuracy of QDA on the scaled features.
# (The variable names mention "svc", but the estimator is QDA.)
clf_svc_cv = QDA()
scores_clf_svc_cv = cross_val_score(clf_svc_cv,X,y,cv=20)
print(scores_clf_svc_cv)
# Mean accuracy together with two standard deviations of the fold scores.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_clf_svc_cv.mean(), scores_clf_svc_cv.std() * 2))

In [None]:
# Hand-picked subset of 20 market and news columns.
columns_market =['volume','close','open','returnsClosePrevRaw1','returnsOpenPrevRaw1',
            'returnsClosePrevMktres1','returnsOpenPrevMktres1','returnsClosePrevRaw10',
           'returnsOpenPrevRaw10','returnsClosePrevMktres10','returnsOpenPrevMktres10',
            'universe','close_to_open','relevance','sentimentClass','sentimentNegative',
         'sentimentNeutral','sentimentPositive','noveltyCount12H','noveltyCount24H']
# NOTE(review): despite the "_pca" names no PCA is applied in this cell; it only
# selects the columns above, standardises them and fits QDA on them.
X_pca=X_origin[columns_market]
X_pca=scale(X_pca)
clf_pca = QDA()
clf_pca.fit(X_pca, y)
# In-sample accuracy and confusion matrix (predicting the rows used for fitting).
y_hat=clf_pca.predict(X_pca)
print(accuracy_score(y,y_hat))
print(confusion_matrix(y,y_hat))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest on the scaled features; random_state fixes the bootstrap and
# feature sampling so that the importances are reproducible between runs.
rf0 = RandomForestClassifier(random_state=1234)
rf0.fit(X,y)
# In-sample accuracy and confusion matrix (predicting the rows used for fitting).
y_hat1=rf0.predict(X)
print(accuracy_score(y,y_hat1))
print(confusion_matrix(y,y_hat1))
importances=rf0.feature_importances_
# Feature positions ordered from most to least important.
indices=np.argsort(importances)[::-1]
# Print each importance next to ITS OWN feature name. The previous version
# printed the names in column order beside the importances in sorted order,
# so the two did not belong together.
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f"%(rank,30,X_origin.columns[idx],importances[idx]))

In [None]:
# Horizontal bars of the feature importances in descending order.
plt.title('Feature Importance ')
n_features = X_origin.shape[1]
plt.barh(range(n_features),importances[indices],height=0.8,color='lightblue',align='center')
# Reorder the tick labels with the same permutation as the bars, so every bar
# carries the name of the feature it belongs to.
plt.yticks(range(n_features),X_origin.columns[indices])
plt.ylim([-1,n_features])
plt.tight_layout()
plt.show()


In [None]:
# Share of the variance captured by each of the first 20 principal components.
pca = PCA(n_components=20)
pca.fit(X)
print(pca.explained_variance_ratio_)
# Principal components are linear combinations of all features, so they are
# labelled PC1, PC2, ... rather than with single original column names as before.
# explained_variance_ratio_ is already ordered from largest to smallest.
for k, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    print("%2d) %-*s %f"%(k,30,"PC%d" % k,ratio))