# __DataFrame__
## A pandas DataFrame is a 2D data structure in which the data is aligned in a tabular fashion, in rows and columns.

##  Characteristics:
### > Columns within the DataFrame can be of different types. 
### > Axes (rows and columns) are labelled. 
### > A pandas DataFrame can be created from various inputs, such as
###   lists, dicts, Series, NumPy ndarrays, or another DataFrame.


# *Creating an empty DataFrame*

In [2]:
import pandas as pd

# With no data supplied, the constructor produces a frame that has
# neither columns nor index entries.
df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


# *Creating a DataFrame from Lists*

In [5]:
import pandas as pd

# A flat list becomes a single column; the column label and the row
# labels both default to a 0-based integer range.
listData = list(range(10, 90, 10))  # 10, 20, ..., 80
df = pd.DataFrame(data=listData)
print(df)

    0
0  10
1  20
2  30
3  40
4  50
5  60
6  70
7  80


# *Creating DataFrame from Nested Lists*

In [10]:
import pandas as pd

# Each inner list is one row: [student name, student age].
listData = [['Alex', 10], ['Bob', 12], ['Mark', 11], ['Tina', 10]]
df = pd.DataFrame(data=listData, columns=['Student_Name', 'Student_Age'])
print(df)


# Specify the column type.
# A frame-wide dtype=float cannot be applied to the string column
# 'Student_Name' (pandas 2.0+ raises instead of silently skipping it),
# so cast only the numeric column.
df = pd.DataFrame(data=listData, columns=['Student_Name', 'Student_Age']).astype({'Student_Age': float})
print(df)

  Student_Name  Student_Age
0         Alex           10
1          Bob           12
2         Mark           11
3         Tina           10
  Student_Name  Student_Age
0         Alex         10.0
1          Bob         12.0
2         Mark         11.0
3         Tina         10.0


# *Creating DataFrames from a dict of ndarrays/lists*

In [19]:
import pandas as pd

# Every dict key becomes a column; the lists supply that column's values.
dictData = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ria', 'Sally'],
    'Age': [25, 22, 28, 20, 22],
}
df = pd.DataFrame(data=dictData)
print(df)   # Rows receive the default integer index 0/1/2/3/4

# Build the same frame again, this time with custom row labels.
rankLabels = ['rank' + str(position) for position in range(1, 6)]
df = pd.DataFrame(data=dictData, index=rankLabels)
print(df)


    Name  Age
0    Tom   25
1   Jack   22
2  Steve   28
3    Ria   20
4  Sally   22
        Name  Age
rank1    Tom   25
rank2   Jack   22
rank3  Steve   28
rank4    Ria   20
rank5  Sally   22


# *Create DataFrame from List of Dicts*

In [23]:
import pandas as pd

# One dict per row. The second record carries an extra 'City' key, so the
# first row has no value for that column and shows NaN in the printout.
dataListDict = [
    dict(Name='Tom', Age=26),
    dict(Name='Alicia', Age=22, City='Boston'),
]
internLabels = ['Intern_1', 'Intern_2']
df = pd.DataFrame(data=dataListDict, index=internLabels)
print(df)


          Age    City    Name
Intern_1   26     NaN     Tom
Intern_2   22  Boston  Alicia


# *Create a DataFrame from Dict of Series*

In [27]:
import pandas as pd

# Each Series becomes one column; the Series index labels become row labels.
emp1Record = pd.Series([1001, 'Ameya', 10000],
                       index=['EmpID', 'EmpName', 'EmpSal'])
emp2Record = pd.Series([1002, 'Anjanay', 12000.66, 'Mumbai'],
                       index=['EmpID', 'EmpName', 'EmpSal', 'EmpCity'])
dataDictSeries = {'Emp1': emp1Record, 'Emp2': emp2Record}

df = pd.DataFrame(data=dataDictSeries)
# Emp1 has no 'EmpCity' entry, so that cell is printed as NaN.
print(df)


          Emp1     Emp2
EmpCity    NaN   Mumbai
EmpID     1001     1002
EmpName  Ameya  Anjanay
EmpSal   10000  12000.7


# *Adding a new column to the DataFrame*

In [38]:
import pandas as pd

# Start from the two-employee frame built out of a dict of Series.
fullLabels = ['EmpID', 'EmpName', 'EmpSal', 'EmpCity']
dataDictSeries = {
    'Emp1': pd.Series([1001, 'Ameya', 10000], index=['EmpID', 'EmpName', 'EmpSal']),
    'Emp2': pd.Series([1002, 'Anjanay', 12000.66, 'Mumbai'], index=fullLabels),
}
df = pd.DataFrame(data=dataDictSeries)

# Assigning a Series to a new column label appends it as a column.
emp3Record = pd.Series([1003, 'Ambikay', 15000.55, 'Delhi'], index=fullLabels)
df['Emp3'] = emp3Record

# Emp1 still has no 'EmpCity' value, so that cell is printed as NaN.
print(df)

# A new column can also be derived from existing ones: matching entries
# are added (numbers) or concatenated (strings).
emp1Column = df['Emp1']
emp3Column = df['Emp3']
df['Emp4'] = emp1Column + emp3Column
print(df)



          Emp1     Emp2     Emp3
EmpCity    NaN   Mumbai    Delhi
EmpID     1001     1002     1003
EmpName  Ameya  Anjanay  Ambikay
EmpSal   10000  12000.7  15000.5
          Emp1     Emp2     Emp3          Emp4
EmpCity    NaN   Mumbai    Delhi           NaN
EmpID     1001     1002     1003          2004
EmpName  Ameya  Anjanay  Ambikay  AmeyaAmbikay
EmpSal   10000  12000.7  15000.5       25000.5


# *Column Deletion in the DataFrame*

In [39]:
# NOTE: this cell reuses the `df` left behind by an earlier cell; the
# deletions below expect it to still contain the 'Emp3' and 'Emp4' columns.
print(df)
# Let's delete the 'Emp4' column.
# Method [1] - using the `del` statement (a statement, not a function call)
del df['Emp4']
print(df)

# Method [2] - using pop(); its return value is not used here
df.pop('Emp3')
print(df)

          Emp1     Emp2     Emp3          Emp4
EmpCity    NaN   Mumbai    Delhi           NaN
EmpID     1001     1002     1003          2004
EmpName  Ameya  Anjanay  Ambikay  AmeyaAmbikay
EmpSal   10000  12000.7  15000.5       25000.5
          Emp1     Emp2     Emp3
EmpCity    NaN   Mumbai    Delhi
EmpID     1001     1002     1003
EmpName  Ameya  Anjanay  Ambikay
EmpSal   10000  12000.7  15000.5
          Emp1     Emp2
EmpCity    NaN   Mumbai
EmpID     1001     1002
EmpName  Ameya  Anjanay
EmpSal   10000  12000.7


# *Specific Row Selection*

In [45]:
import pandas as pd

emp1Record = pd.Series([1001, 'Ameya', 10000],
                       index=['EmpID', 'EmpName', 'EmpSal'])
emp2Record = pd.Series([1002, 'Anjanay', 12000.66, 'Mumbai'],
                       index=['EmpID', 'EmpName', 'EmpSal', 'EmpCity'])
dataDictSeries = {'Emp1': emp1Record, 'Emp2': emp2Record}

df = pd.DataFrame(data=dataDictSeries)

# Visual divider printed between the sections of the output.
divider = "+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-"

# Whole frame first, for reference.
print(df)

print(divider)
# .loc picks a row by its label.
rowByLabel = df.loc['EmpName']
print(rowByLabel)

print(divider)
# .iloc picks a row by its integer position.
rowByPosition = df.iloc[2]
print(rowByPosition)


          Emp1     Emp2
EmpCity    NaN   Mumbai
EmpID     1001     1002
EmpName  Ameya  Anjanay
EmpSal   10000  12000.7
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-
Emp1      Ameya
Emp2    Anjanay
Name: EmpName, dtype: object
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-
Emp1      Ameya
Emp2    Anjanay
Name: EmpName, dtype: object


# *Slice Rows*
## Multiple rows can be selected using the ":" operator. 

In [48]:
import pandas as pd

emp1Record = pd.Series([1001, 'Ameya', 10000],
                       index=['EmpID', 'EmpName', 'EmpSal'])
emp2Record = pd.Series([1002, 'Anjanay', 12000.66, 'Mumbai'],
                       index=['EmpID', 'EmpName', 'EmpSal', 'EmpCity'])
dataDictSeries = {'Emp1': emp1Record, 'Emp2': emp2Record}

df = pd.DataFrame(data=dataDictSeries)

# Row slice: the start position is included, the end position is not,
# so 2:4 yields the rows at positions 2 and 3.
selectedRows = df[2:4]
print(selectedRows)

          Emp1     Emp2
EmpName  Ameya  Anjanay
EmpSal   10000  12000.7


# *Addition of new rows to the DataFrame*

In [60]:
import pandas as pd

# Two frames that share the same columns: df1 holds the existing
# records, df2 holds the single record to be added.
df1 = pd.DataFrame([[1001, 'Ameya', 10000], [1002, 'Anjanay', 12000.66]], columns = ['EmpID', 'EmpName', 'EmpSal'], index=['E1', 'E2'])
df2 = pd.DataFrame([[1003, 'Ambikay', 15550.50]], columns=['EmpID', 'EmpName', 'EmpSal'], index=['E3'])

# DataFrame.append() was removed in pandas 2.0; pd.concat() stacks the
# frames row-wise and keeps each frame's own index labels, so the new
# record lands at the end.
df1 = pd.concat([df1, df2])
print(df1)


    EmpID  EmpName    EmpSal
E1   1001    Ameya  10000.00
E2   1002  Anjanay  12000.66
E3   1003  Ambikay  15550.50
