In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import yfinance as yf
import pandas_datareader as pdr
from pandas.tseries.offsets import DateOffset

import re

pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


## Clinical Trial Dataset: 

In [3]:
#get the Clinical Trial for COVID Data
covid = pd.read_csv("20210412020005_covid-19.tsv" ,sep='\t')

industry=covid[covid['funded_bys'].str.find('Industry')>-1]

industry_covid=pd.melt(industry, id_vars=['nct_id', 'other_ids', 'status','why_stopped', 'funded_bys', 'sponsor_collaborators','study_type', 'phases', 'conditions', 'keywords',
       'interventions', 'intervention_details', 'arm_details', 'outcome_measures', 'start_date','primary_completion_date', 'completion_date', 'first_posted',
       'results_first_posted', 'last_update_posted', 'nlm_download_date','study_first_submitted_date', 'has_expanded_access',
       'is_fda_regulated_drug', 'is_fda_regulated_device','is_unapproved_device', 'locations', 'number_of_facilities',
       'has_us_facility', 'has_single_facility'], value_vars=['lead_sponsor', 'collaborators'], value_name='company')
#seperate them 
industry_covid['explode']=industry_covid.company.str.replace(',', '').str.split('|')
#filter out the industry
industry_covid=industry_covid.explode(column='explode')
industry_covid=industry_covid[industry_covid['explode'].str.find('Industry')>-1]
industry_covid['industry_name']=industry_covid['explode'].str.extract(r'(.*)\[Industry\]')

industry_covid['filter_name']=industry_covid['industry_name'].str.lower().str.replace(' ag', '')
industry_covid['filter_name']=industry_covid['filter_name'].str.replace('.', '').str.replace(' inc', '',)
industry_covid['filter_name']=industry_covid['filter_name'].str.replace(' holdings', '')
industry_covid['filter_name']=industry_covid['filter_name'].str.replace(' llc', '')
industry_covid['filter_name']=industry_covid['filter_name'].str.replace(' corporation', '').str.replace(' corp', '').str.replace(' co\.', '')
industry_covid['filter_name']=industry_covid['filter_name'].str.replace(' se', '').str.replace(' ltd', '')

industry_covid=industry_covid.reset_index()
industry_covid['merge_name']=industry_covid['filter_name'].astype(str).str.extract(r'(^[\d\w\-\&\.]* ?[\w\-\&]{1,4})')

  industry_covid['filter_name']=industry_covid['filter_name'].str.replace('.', '').str.replace(' inc', '',)
  industry_covid['filter_name']=industry_covid['filter_name'].str.replace(' corporation', '').str.replace(' corp', '').str.replace(' co\.', '')


## Nasdaq Dataset:

In [4]:
#get the data
nasdaq=pd.read_csv('nasdaq_stocks.csv')

sectors=['Health Care']
health_stocks=nasdaq[nasdaq.Sector.isin(sectors)]

#remove any stocks that has no market cap
health_stocks=health_stocks[health_stocks['Market Cap']>1]

#get stock name of the company ( first word and two letter of the second word)

health_stocks['merge_name']=health_stocks['Name'].str.lower().str.replace(' american depositary shares', '')
health_stocks['merge_name']=health_stocks['merge_name'].str.replace('common stock', '')
health_stocks['merge_name']=health_stocks['merge_name'].str.replace(' inc.', '')\
                                    .str.replace('corporation', '').str.replace('corp', '').str.replace('se ', '').str.replace('.', '')
health_stocks['merge_name']=health_stocks.merge_name.str.extract(r'(^[\w\-\&\.]* ?[\w\-\&]{1,4})')

health_stocks.rename(columns={'Name':'Stock_Name', 'Symbol':'Stock_Symbol'}, inplace=True)
health_stocks=health_stocks.reset_index()

#count words that were extracted
industry_covid['merge_name']=industry_covid['merge_name'].str.rstrip(' ')
industry_covid['number_words']=industry_covid['merge_name'].str.count(' ')+1

health_stocks['merge_name']=health_stocks['merge_name'].str.rstrip(' ')
health_stocks['number_words']=health_stocks['merge_name'].str.count(' ')+1

#break up df with the number of words
industry_1=industry_covid[industry_covid['number_words']==1]
industry_2=industry_covid[industry_covid['number_words']==2]

stocks_1=health_stocks[health_stocks['number_words']==1]
stocks_2=health_stocks[health_stocks['number_words']==2]

  health_stocks['merge_name']=health_stocks['merge_name'].str.replace(' inc.', '')\
  health_stocks['merge_name']=health_stocks['merge_name'].str.replace(' inc.', '')\


## Merging Clinical Trials and Nasdaq Datasets: 

### Merge: Step1 and Step 2

In [5]:
#Merge the names of the stocks and clinical trials using two words
outer2_2=industry_2.merge(stocks_2, on='merge_name', how='outer')
inner2_2=industry_2.merge(stocks_2, on='merge_name', how='inner')

#Merge the names of the stocks and clinical trials using one words
outer1_1=industry_1.merge(stocks_1, on='merge_name', how='outer')
inner1_1=industry_1.merge(stocks_1, on='merge_name', how='inner')

### Merge: Step3 and Step 4

In [6]:
col_industry=['nct_id', 'other_ids', 'status', 'why_stopped', 'funded_bys',
       'sponsor_collaborators','study_type',
       'phases', 'conditions', 'keywords', 'interventions',
       'intervention_details', 'arm_details', 'outcome_measures', 'start_date',
       'primary_completion_date', 'completion_date', 'first_posted',
       'results_first_posted', 'last_update_posted', 'nlm_download_date',
       'study_first_submitted_date', 'has_expanded_access',
       'is_fda_regulated_drug', 'is_fda_regulated_device',
       'is_unapproved_device', 'locations', 'number_of_facilities',
       'has_us_facility', 'has_single_facility', 'company', 'explode', 'merge_name']
col_stocks=['merge_name','Stock_Symbol', 'Stock_Name', 'Last Sale',
       'Net Change', '% Change', 'Market Cap', 'Country', 'IPO Year', 'Volume',
       'Sector', 'Industry']


#industries witout corresponding stocks
industry_2_wo_stocks=outer2_2[outer2_2.Stock_Symbol.isna()][col_industry]
industry_1_wo_stock=outer1_1[outer1_1['Stock_Name'].isna()][col_industry]

#stocks without corresponding industries
stocks_2_wo_industry=outer2_2[outer2_2.company.isna()][col_stocks]
stocks_1_wo_industry=outer1_1[outer1_1['company'].isna()][col_stocks]

#Extracting one-word names from the unmerged two word name companies. 
stocks_2_wo_industry['word']=stocks_2_wo_industry.merge_name.str.extract(r'(^[\w\-\&]*)\s')
industry_2_wo_stocks['word']=industry_2_wo_stocks.merge_name.str.extract(r'(^[\w\-\&]*)\s')

#Prepare columns for merge
industry_1_wo_stock['word']=industry_1_wo_stock['merge_name']
stocks_1_wo_industry['word']=stocks_1_wo_industry['merge_name']

#Actual Step3 and Step 4 merge
industry1_stocks_2=industry_1_wo_stock.merge(stocks_2_wo_industry, on='word', how='inner')
industry2_stocks1=industry_2_wo_stocks.merge(stocks_1_wo_industry, on='word', how='inner')
industry2_stocks1=industry2_stocks1[industry2_stocks1.Stock_Symbol!='NATR']

### Merge: Step5

In [7]:
#Need to add in Johnson & Jonhson, it's under the name Janssen
#use one word to merge Jassen to Johnson Johnson
johnson_and_johnson=industry_2[industry_2.merge_name.str.find('janssen')>-1]
johnson_and_johnson['merge_name']=johnson_and_johnson['merge_name'].str.extract(r'(^\w*)\s')
johnson_and_johnson[col_industry]
johnson_and_johnson=johnson_and_johnson.reset_index()

jnj_stock=health_stocks[health_stocks.Stock_Symbol=='JNJ']
jnj_df=jnj_stock.append(jnj_stock).append(jnj_stock).append(jnj_stock).append(jnj_stock).append(jnj_stock).append(jnj_stock).append(jnj_stock).append(jnj_stock).append(jnj_stock).append(jnj_stock)
jnj_df=jnj_df[['Stock_Symbol', 'Stock_Name', 'Last Sale', 'Net Change',
       '% Change', 'Market Cap', 'Country', 'IPO Year', 'Volume', 'Sector',
       'Industry', 'merge_name',]]

jnj_df=jnj_df.reset_index()
johnson_df=johnson_and_johnson.merge(jnj_df, left_index=True, right_index=True, how='outer')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  johnson_and_johnson['merge_name']=johnson_and_johnson['merge_name'].str.extract(r'(^\w*)\s')


### Merging all the Five of the Dataframes together

In [8]:
#Ensure all the columns match before stacking them 

col_final_merge=['nct_id', 'other_ids', 'status', 'why_stopped', 'funded_bys','study_type',
       'phases', 'conditions', 'keywords', 'interventions',
       'intervention_details', 'arm_details', 'outcome_measures', 'start_date',
       'primary_completion_date', 'completion_date', 'first_posted',
       'results_first_posted', 'last_update_posted', 'nlm_download_date',
       'study_first_submitted_date', 'has_expanded_access',
       'is_fda_regulated_drug', 'is_fda_regulated_device',
       'is_unapproved_device', 'locations', 'number_of_facilities',
       'has_us_facility', 'has_single_facility', 'explode', 'merge_name', 'Stock_Symbol', 'Stock_Name', 'Last Sale',
       'Net Change', '% Change', 'Market Cap', 'Country', 'IPO Year', 'Volume',
       'Sector', 'Industry']

industry1_stocks_2.rename(columns={'merge_name_x':'merge_name'}, inplace=True)
industry2_stocks1.rename(columns={'merge_name_x':'merge_name'}, inplace=True)

industry1_stocks_2=industry1_stocks_2[col_final_merge]
industry2_stocks1=industry2_stocks1[col_final_merge]

inner2_2=inner2_2[col_final_merge]
inner1_1=inner1_1[col_final_merge]

clinicalstocks=pd.concat([industry1_stocks_2, inner2_2, inner1_1, industry2_stocks1,johnson_df])

#converting all the date strings to datatime
clinicalstocks['start_date']=pd.to_datetime(clinicalstocks['start_date'])
clinicalstocks['primary_completion_date']=pd.to_datetime(clinicalstocks['primary_completion_date'])
clinicalstocks['completion_date']=pd.to_datetime(clinicalstocks['completion_date'])
clinicalstocks['first_posted']=pd.to_datetime(clinicalstocks['first_posted'])
clinicalstocks['results_first_posted']=pd.to_datetime(clinicalstocks['results_first_posted'])

In [9]:
df=clinicalstocks

In [14]:
df['symbol']=df['Stock_Symbol']