In [23]:
import numpy as np
import pandas as pd

# Series creation: 

In [None]:
# using lists

In [13]:
l = [10,20,30,40]

In [14]:
pd.Series(l)

0    10
1    20
2    30
3    40
dtype: int64

In [15]:
pd.Series(l,index=['i','ii','iii','iv'])

i      10
ii     20
iii    30
iv     40
dtype: int64

In [10]:
s2 = pd.Series([68, 83, 112, 68], index=["alice", "bob", "charles", "darwin"])
print(s2)

alice       68
bob         83
charles    112
darwin      68
dtype: int64


In [None]:
using dictionaries - it takes eys as index

In [4]:
weights = {"alice": 68, "bob": 83, "colin": 86,
"darwin": 68}

In [8]:
s3 = pd.Series(weights)
print(s3)

alice     68
bob       83
colin     86
darwin    68
dtype: int64


In [6]:
# control the order and elements to include

In [7]:
s4 = pd.Series(weights, index = ["colin", "alice"])
print(s4)

colin    86
alice    68
dtype: int64


In [None]:
# auto allignement of index

In [11]:
s2+s3  
# s2 -> has charles and s3 -> has colin not the other

alice      136.0
bob        166.0
charles      NaN
colin        NaN
darwin     136.0
dtype: float64

In [12]:
# initializing a series with a scalar

In [13]:
meaning = pd.Series(42, ["life", "universe","everything"])
print(meaning)

life          42
universe      42
everything    42
dtype: int64


In [14]:
# a series can have a name:

In [15]:
s6 = pd.Series([83, 68], index=["bob", "alice"],name="weights")
print(s6)

bob      83
alice    68
Name: weights, dtype: int64


# DataFrame creation:

In [16]:
d = {'Name':['rama','balaji','srinivaas'],'Age':[50,60,25]}

In [17]:
pd.DataFrame(d)

Unnamed: 0,Name,Age
0,rama,50
1,balaji,60
2,srinivaas,25


In [24]:
df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),columns=['a', 'b', 'c'])

In [25]:
df2

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
# create dataframe from a dictionary of Seriea

In [16]:
people_dict = {

"weight": pd.Series([68, 83, 112],index=["alice","bob", "charles"]),

"birthyear": pd.Series([1984, 1985, 1992],index=["bob", "alice", "charles"], name="year"),

"children": pd.Series([0, 3], index=["charles","bob"]),

"hobby": pd.Series(["Biking", "Dancing"],index=["alice", "bob"]),
}

In [22]:
people = pd.DataFrame(people_dict)
people

Unnamed: 0,weight,birthyear,children,hobby
alice,68,1985,,Biking
bob,83,1984,3.0,Dancing
charles,112,1992,0.0,


In [23]:
# Include columns and/or rows and guarantee order

In [26]:
d2 = pd.DataFrame(
...    people_dict,columns=["birthyear", "weight", "height"],index=["bob", "alice", "eugene"])
print(d2)

        birthyear  weight height
bob        1984.0    83.0    NaN
alice      1985.0    68.0    NaN
eugene        NaN     NaN    NaN


In [31]:
# Adding & Removing new columns to dataframe

In [35]:
# Adds a new column "age"
people["age"] = 2016 - people["birthyear"]
people

Unnamed: 0,weight,birthyear,children,hobby,age
alice,68,1985,,Biking,31
bob,83,1984,3.0,Dancing,32
charles,112,1992,0.0,,24


In [36]:
# Adds another column "over 30"
people["over 30"] = people["age"] > 30
people

Unnamed: 0,weight,birthyear,children,hobby,age,over 30
alice,68,1985,,Biking,31,True
bob,83,1984,3.0,Dancing,32,True
charles,112,1992,0.0,,24,False


In [37]:
# Removes "birthyear" and "children" columns
birthyears = people.pop("birthyear") # deletes and returns the col.
del people["children"]  # only deletes the col.

In [38]:
birthyears

alice      1985
bob        1984
charles    1992
Name: birthyear, dtype: int64

In [39]:
people

Unnamed: 0,weight,hobby,age,over 30
alice,68,Biking,31,True
bob,83,Dancing,32,True
charles,112,,24,False


In [40]:
# alice is missing, eugene is ignored
people["pets"] = pd.Series({"bob": 0,"charles": 5,"eugene":1})
people

Unnamed: 0,weight,hobby,age,over 30,pets
alice,68,Biking,31,True,
bob,83,Dancing,32,True,0.0
charles,112,,24,False,5.0


# insert()

In [41]:
# Add a new column using insert method after an existing column
# syntax: dataframe.insert(loc, column, value, allow_duplicates=False)
# loc : int - Insertion index. Must verify 0 <= loc <= len(columns).

In [42]:
people.insert(1, "height", [172, 181, 185])
people

Unnamed: 0,weight,height,hobby,age,over 30,pets
alice,68,172,Biking,31,True,
bob,83,181,Dancing,32,True,0.0
charles,112,185,,24,False,5.0


# assign()

In [None]:
# assign(col_tag = func(col_in_dataframe))

In [50]:
people.assign(body_mass_index = lambda df:df["weight"]/ (df["height"] / 100) ** 2).assign(
    overweight = lambda df:df["body_mass_index"] > 25)

Unnamed: 0,weight,height,hobby,age,over 30,pets,body_mass_index,overweight
alice,68,172,Biking,31,True,,22.985398,False
bob,83,181,Dancing,32,True,0.0,25.335002,True
charles,112,185,,24,False,5.0,32.724617,True


In [59]:
people['body_mass_index'] = people["weight"]/(people["height"] / 100) ** 2
people['overweight'] = people['body_mass_index']>25

In [60]:
people

Unnamed: 0,weight,height,hobby,age,over 30,pets,body_mass_index,overweight
alice,68,172,Biking,31,True,,22.985398,False
bob,83,181,Dancing,32,True,0.0,25.335002,True
charles,112,185,,24,False,5.0,32.724617,True


# stringIO(data)

In [24]:
from io import StringIO, BytesIO

In [25]:
#StringIO() -> Text I/O implementation using an in-memory buffer.

In [26]:
data = 'col1,col2,col3\n'+'x,y,1\n'+'a,b,2\n'+'c,d,3'

In [27]:
type(data)

str

In [28]:
df = pd.read_csv(StringIO(data))
df

Unnamed: 0,col1,col2,col3
0,x,y,1
1,a,b,2
2,c,d,3


In [29]:
df.to_csv('test.csv')

In [None]:
# to assign different data types for diff. cols.

In [39]:
data2 = ('a,b,c,d\n'
           '1,2,3,4\n'
           '5,6,7,8\n'
           '9,10,11,12')

In [40]:
df = pd.read_csv(StringIO(data2),dtype={'b':int,'c':np.float,'a':'Int64'})

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df = pd.read_csv(StringIO(data2),dtype={'b':int,'c':np.float,'a':'Int64'})


In [41]:
df

Unnamed: 0,a,b,c,d
0,1,2,3.0,4
1,5,6,7.0,8
2,9,10,11.0,12


In [None]:
# to make one colun as index

In [42]:
data3 = ('index,a,b,c\n'
            '4,apple,bat,5.7\n'
            '8,orange,cow,10')

In [44]:
pd.read_csv(StringIO(data3),index_col=0)

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [45]:
data4 = ('a,b,c\n'
            '4,apple,bat,5.7\n'
            '8,orange,cow,10') # when the first value is a number :

In [46]:
pd.read_csv(StringIO(data4))

Unnamed: 0,a,b,c
4,apple,bat,5.7
8,orange,cow,10.0


In [47]:
# to rectify this:

In [48]:
pd.read_csv(StringIO(data4),index_col=False)

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [49]:
# Quoting and Escape Characters. when we want to skip some characters
# Very usefull in NLP

In [52]:
data5 = 'a,b\n"hello, \\Bob\\", nice to see you",5'

In [53]:
pd.read_csv(StringIO(data5),escapechar='\\')

Unnamed: 0,a,b
0,"hello, Bob"", nice to see you",5
