# Pandas3, Pandas4

# Correlation and Covariance

study link:https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/

In [1]:
import pandas as pd
import numpy as np
import pandas_datareader.data as web

In [2]:
# dictionary comprehension

all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

ConnectionError: HTTPSConnectionPool(host='finance.yahoo.com', port=443): Max retries exceeded with url: /quote/AAPL/history?period1=1458169200&period2=1615935599&interval=1d&frequency=1d&filter=history (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f0e07fdf5b0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [None]:
price = pd.DataFrame({ticker: data['Adj Close'] 
                      for (ticker, data) in all_data.items()})

In [None]:
price

In [None]:
volume = pd.DataFrame({ticker: data['Volume'] for (ticker, data) in all_data.items()})

In [None]:
volume.name = 'Volume'
volume.head()

In [None]:
returns = price.pct_change()

In [None]:
returns.tail()

In [None]:
returns['MSFT'].corr(returns['IBM'])

In [None]:
returns['MSFT'].cov(returns['IBM'])

In [None]:
returns.corr()

In [None]:
returns.corrwith(returns.IBM)

## Pandas 4

## Chapter 7
#### 7.1 : Handling Missing Data
#### 7.2 : Data Transformation
#### 7.3 : String Manipulation

### 7.1 : Handling Missing Data

- Filtering Out Missing Data
> 1. isnull
> 2. notnull
> 3. dropna
- Filling In Missing Data
> 1. fillna


#### Filtering Out Missing Data:

In [None]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, None])
string_data

In [None]:
string_data.isnull()

In [None]:
string_data[string_data.isnull()]

In [None]:
string_data[string_data.notnull()]

In [None]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])

In [None]:
data

In [None]:
test = data.dropna()
test

In [None]:
data.dropna(inplace=True)

In [None]:
data

In [None]:
data = pd.DataFrame([[1., 6.5, 3.],
                     [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], 
                     [np.nan, 6.5, 3.]])

In [None]:
data

In [None]:
data.dropna()

In [None]:
data.dropna(axis=1)

In [None]:
data.dropna(axis=0, how="all")

#### Filling In Missing Data

In [None]:
df = pd.DataFrame(np.random.randn(7, 3))
df

In [None]:
df.iloc[:4, 1] = np.nan

In [None]:
df.iloc[:2, 2] = np.nan

In [None]:
df

In [None]:
df.fillna(0)

In [None]:
df.fillna({1: 0.5, 2: 0.7})

### 7.2 : Data Transformation

- Removing Duplicated
> 1. duplicated
> 2. drop_duplicated
- Transforming Data Using a Function or Mapping
> 1. map
- Replacing Values
> 1. replace
- Renaming Axis Indexes
> 1. rename
- Discretization And Binning
> 1. cut
- Detecting And Filtering Outliers

#### Removing Duplicated

In [None]:
['one', 'two'] * 3 

In [None]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],'k2': [1, 1, 1, 3, 3, 4, 4]})

In [None]:
data

In [None]:
data.duplicated()

In [None]:
data[data.duplicated()]

In [None]:
data.drop_duplicates()

In [None]:
data['v1'] = range(7)

In [None]:
data

In [None]:
data.duplicated(['k1'])

In [None]:
data.drop_duplicates(['k1'])

In [None]:
data.drop_duplicates(['k1'], keep='last')

#### Transforming Data Using A Function Or Mapping

In [None]:
data = pd.DataFrame({'food': ['bacon', 'pulled', 'bacon','Pastrami', 'corned beef', 'Bacon','pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [None]:
data

In [None]:
meat_to_animal = {
'bacon': 'ppp',
'pulled': 'ppp',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'ppp',
'nova lox': 'salmon'
}

In [None]:
lowercased = data['food'].str.lower()

In [None]:
lowercased

In [None]:
data['animal1'] = lowercased.map(meat_to_animal)

In [None]:
data

In [None]:
data['animal']=data['food'].map(lambda x: meat_to_animal[x.lower()]) #one line code

In [None]:
data

#### Replacing Values

In [3]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [4]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [6]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [7]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [8]:
data.replace({-999: np.nan, -1000: 0}) # key==prev and value=next

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Discretization and Binning

In [9]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [10]:
bins = [18, 25, 35, 60, 100]

In [11]:
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [12]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [13]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [14]:
pd.value_counts(cats)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

## Detecting and Filtering Outliers

In [62]:
data = pd.DataFrame(np.random.randn(1000, 4))

In [63]:
data.head()

Unnamed: 0,0,1,2,3
0,1.352202,-0.589229,-0.15655,0.597719
1,-0.652831,-0.211145,0.40164,-0.689107
2,-0.883552,0.801802,1.317318,0.501651
3,0.334485,-0.119986,0.347666,-0.179313
4,0.354423,-1.331511,-1.274266,-0.643439


In [64]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.033919,-0.000604,5.7e-05,-0.000293
std,0.970966,0.983082,1.001271,0.995195
min,-2.768406,-3.005857,-3.400656,-2.895652
25%,-0.643992,-0.728007,-0.667906,-0.643626
50%,0.021747,-0.016911,0.058546,-0.02055
75%,0.691657,0.663832,0.666703,0.642842
max,3.121891,3.059499,4.015471,3.226424


In [65]:
# mean
data.iloc[:, 0].sum() / 1000

0.03391859930570011

In [66]:
np.std(data.iloc[:, 0])

0.9704798989858939

In [67]:
min(data.iloc[:, 0])

-2.7684060633774914

In [69]:
# # 50%
# data.iloc[:, 0].sum() * 0.50

In [70]:
col1 = data[0]
col1.head()

0    1.352202
1   -0.652831
2   -0.883552
3    0.334485
4    0.354423
Name: 0, dtype: float64

In [71]:
col1[np.abs(col1) > 3]

777    3.121891
Name: 0, dtype: float64

In [72]:
np.sign(data)

Unnamed: 0,0,1,2,3
0,1.0,-1.0,-1.0,1.0
1,-1.0,-1.0,1.0,-1.0
2,-1.0,1.0,1.0,1.0
3,1.0,-1.0,1.0,-1.0
4,1.0,-1.0,-1.0,-1.0
...,...,...,...,...
995,1.0,1.0,1.0,1.0
996,1.0,1.0,1.0,1.0
997,-1.0,-1.0,-1.0,1.0
998,1.0,-1.0,-1.0,-1.0


In [73]:
np.abs(data)

Unnamed: 0,0,1,2,3
0,1.352202,0.589229,0.156550,0.597719
1,0.652831,0.211145,0.401640,0.689107
2,0.883552,0.801802,1.317318,0.501651
3,0.334485,0.119986,0.347666,0.179313
4,0.354423,1.331511,1.274266,0.643439
...,...,...,...,...
995,0.566616,1.096079,1.382377,0.250803
996,0.170809,0.203399,0.049574,1.242357
997,1.150370,1.131777,2.449691,0.287474
998,0.453345,0.887949,1.548059,1.346906


In [74]:
np.abs(data) > 3

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


In [75]:
data[(np.abs(data) > 3)]

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
995,,,,
996,,,,
997,,,,
998,,,,


In [76]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
98,0.15601,3.043914,0.560242,-1.04527
198,2.080735,-3.005857,0.53485,0.044706
274,0.845731,3.059499,-1.538307,-2.219145
378,0.692574,-1.167183,4.015471,1.250783
633,-0.193994,-0.263785,-3.022661,0.516834
656,1.121986,-1.026077,-3.400656,0.130422
777,3.121891,1.109081,-1.20406,1.460883
908,0.107851,-1.179264,-0.527532,3.226424


In [77]:
data[np.abs(data) > 3] = np.sign(data) * 3

In [79]:
data

Unnamed: 0,0,1,2,3
0,1.352202,-0.589229,-0.156550,0.597719
1,-0.652831,-0.211145,0.401640,-0.689107
2,-0.883552,0.801802,1.317318,0.501651
3,0.334485,-0.119986,0.347666,-0.179313
4,0.354423,-1.331511,-1.274266,-0.643439
...,...,...,...,...
995,0.566616,1.096079,1.382377,0.250803
996,0.170809,0.203399,0.049574,1.242357
997,-1.150370,-1.131777,-2.449691,0.287474
998,0.453345,-0.887949,-1.548059,-1.346906


In [81]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.033797,-0.000701,-0.000535,-0.000519
std,0.970585,0.982746,0.996348,0.994486
min,-2.768406,-3.0,-3.0,-2.895652
25%,-0.643992,-0.728007,-0.667906,-0.643626
50%,0.021747,-0.016911,0.058546,-0.02055
75%,0.691657,0.663832,0.666703,0.642842
max,3.0,3.0,3.0,3.0


In [90]:
np.broadcast_arrays()

[]

### 7.3 : String Manipulation

- Regular Expressions
> 1. regex

In [83]:
import re

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [84]:
pattern = r'[A-Z0-9._+-]+@[A-Z0-9.-]+.[A-Z]{2,4}'  # text@text.pk

In [86]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [87]:
regex

re.compile(r'[A-Z0-9._+-]+@[A-Z0-9.-]+.[A-Z]{2,4}', re.IGNORECASE|re.UNICODE)

In [89]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [100]:
pattern = r'[0-9]+.[0-9]{2,4}' # $3.0
text = 'price of table is $399.025553'

regex = re.compile(pattern)
print(regex)
regex.findall(text)

re.compile('[0-9]+.[0-9]{2,4}')


['399.0255']

In [112]:
pattern = r'[A-Z0-9._+-]+@gmail.com'  # text@gmail.com

text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

regex = re.compile(pattern)
print(regex)
regex.findall(text)

re.compile('[A-Z0-9._+-]+@')


[]