# Pandas

There are two core objects in pandas: the **DataFrame** and the **Series**. A DataFrame is a two-dimensional table with labelled rows and columns. A Series is a one-dimensional, labelled sequence of data values; in essence, it is a single column of a DataFrame.


In [2]:
import pandas as pd

In [4]:
# A Series built from a plain Python list; pandas adds a default 0..n-1 index.
pd.Series(data=[1, 2, 3, 4, 5])

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [24]:
# Build a small frame by hand; a real dataset would typically be loaded with
# pd.read_csv("file.csv") instead.
df = pd.DataFrame({'Yes': [50, 21, 15], 'No': [131, 2, 16]})

display(df.head())                # first five rows (this frame only has three)
print(df.shape)                   # (number of rows, number of columns)

display(df['Yes'])                # one column as a Series; df.Yes is the attribute shorthand

# Single element: .at performs one label-based scalar lookup, instead of the
# chained df['Yes'][0] which first extracts the column and then indexes it.
print(df.at[0, 'Yes'], "\n")

# One row selected by position. iloc also accepts a [row, col] pair or lists
# of positions, e.g. df.iloc[0, 1] or df.iloc[[0, 2], 0].
display(df.iloc[0])


Unnamed: 0,Yes,No
0,50,131
1,21,2
2,15,16


(3, 2)


0    50
1    21
2    15
Name: Yes, dtype: int64

50 



Yes     50
No     131
Name: 0, dtype: int64

In [30]:
# Boolean-mask selection: keep only the rows whose 'Yes' value is greater than 20.
df.loc[df['Yes'].gt(20)]

Unnamed: 0,Yes,No
0,50,131
1,21,2


In [18]:
# An empty frame has no row labels and no columns.
# (Output as recorded in this notebook; newer pandas releases may report an
# empty RangeIndex for the row index instead -- verify on your version.)
df = pd.DataFrame()
print(df.index, df.columns)   # Index([], dtype='object')  Index([], dtype='object')

# A list plus `columns=` yields one named column and a default 0..n-1 row index.
df = pd.DataFrame([1, 3, 2], columns=['A'])
print(df.index, df.columns)   # RangeIndex(start=0, stop=3, step=1)  Index(['A'], dtype='object')

# Assigning a column to an empty frame produces the same index and columns.
df = pd.DataFrame()
df['A'] = [1, 2, 3]
print(df.index, df.columns)   # RangeIndex(start=0, stop=3, step=1)  Index(['A'], dtype='object')

# Row labels can be given explicitly and need not share a type ('r0' and 1).
df = pd.DataFrame({"Name": ["Jon", "Mia"],
                   "Age":  [43, 39]},
                  index=['r0', 1])
df

Index([], dtype='object') Index([], dtype='object')
RangeIndex(start=0, stop=3, step=1) Index(['A'], dtype='object')
RangeIndex(start=0, stop=3, step=1) Index(['A'], dtype='object')


Unnamed: 0,Name,Age
r0,Jon,43
1,Mia,39


In [102]:
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
display(df)

# Shift every value in column 'a' by -50. Vectorized arithmetic replaces the
# per-element map(lambda ...), and assign() returns a new frame rather than
# relying on attribute-style column assignment (df.a = ...).
df = df.assign(a=df['a'] - 50)
display(df)

Unnamed: 0,a,b
0,1,3
1,2,4


Unnamed: 0,a,b
0,-49,3
1,-48,4


In [127]:
df = pd.DataFrame({'a': [ 1,  2],
                   'b': [-1, -2]})
display(df)

# rename() applies the whole mapping in one step, so the two column labels are
# swapped (the data stays where it is; only the names change).
df = df.rename(columns={'a': 'b', 'b': 'a'})
display(df)

# .at raises KeyError when the requested label does not exist. The error is
# the point of this demo, so catch it and show it instead of letting the cell
# fail and stop a "Restart & Run All".
try:
    df.at[0, 'c']
except KeyError as err:
    print(f"KeyError: {err}")

Unnamed: 0,a,b
0,1,-1
1,2,-2


Unnamed: 0,b,a
0,1,-1
1,2,-2


KeyError: 'c'

In [112]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Toy data: one categorical column ('team') and one numeric column ('points').
df = pd.DataFrame({'team':   ['A', 'A', 'B', 'B', 'B', 'B', 'C', 'C'],
                   'points': [25,  12,   15,  14,  19,  23,  25,  29]})
print(df)

# Fit the encoder on the single 'team' column (double brackets keep it 2-D)
# and densify the result so it can be wrapped in a DataFrame.
onehot = OneHotEncoder(handle_unknown='ignore')
onehot_values = onehot.fit_transform(df[['team']]).toarray()

# Take the column labels from the fitted encoder instead of hard-coding
# ['A', 'B', 'C'], so they always match the categories actually present.
# Reusing df.index keeps the join below aligned row-for-row even if df does
# not have a default 0..n-1 index.
onehot_df = pd.DataFrame(onehot_values,
                         columns=onehot.categories_[0],
                         index=df.index)

# Merge the one-hot encoded columns back with the original DataFrame.
final_df = df.join(onehot_df)
print(final_df)

  team  points
0    A      25
1    A      12
2    B      15
3    B      14
4    B      19
5    B      23
6    C      25
7    C      29
  team  points    A    B    C
0    A      25  1.0  0.0  0.0
1    A      12  1.0  0.0  0.0
2    B      15  0.0  1.0  0.0
3    B      14  0.0  1.0  0.0
4    B      19  0.0  1.0  0.0
5    B      23  0.0  1.0  0.0
6    C      25  0.0  0.0  1.0
7    C      29  0.0  0.0  1.0


In [121]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Load the iris data as plain arrays: X holds the features, y the class labels.
X, y = load_iris(return_X_y=True)
print(X[0])   # first sample's feature vector
print(y)      # all class labels

# Build the classifier first, then fit it on the full dataset.
clf = LogisticRegression(max_iter=1000, random_state=0)
clf.fit(X, y)

# NOTE: this scores on the same X, y used for fitting, so it is training
# accuracy rather than an estimate of performance on unseen data.
clf.score(X, y)

[5.1 3.5 1.4 0.2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


0.9733333333333334