### PySpark Configuration

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

### Read CSV

In [5]:
df = spark.read.format("csv").load("/content/sample_data/california_housing_test.csv")
df.show(5)

+-----------+---------+------------------+-----------+--------------+-----------+----------+-------------+------------------+
|        _c0|      _c1|               _c2|        _c3|           _c4|        _c5|       _c6|          _c7|               _c8|
+-----------+---------+------------------+-----------+--------------+-----------+----------+-------------+------------------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population|households|median_income|median_house_value|
|-122.050000|37.370000|         27.000000|3885.000000|    661.000000|1537.000000|606.000000|     6.608500|     344700.000000|
|-118.300000|34.260000|         43.000000|1510.000000|    310.000000| 809.000000|277.000000|     3.599000|     176500.000000|
|-117.810000|33.780000|         27.000000|3589.000000|    507.000000|1484.000000|495.000000|     5.793400|     270500.000000|
|-118.360000|33.820000|         28.000000|  67.000000|     15.000000|  49.000000| 11.000000|     6.135900|     330000.

In [7]:
df = spark.read.option('header','True').csv("/content/sample_data/california_housing_test.csv")
df.show()

+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population| households|median_income|median_house_value|
+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+
|-122.050000|37.370000|         27.000000|3885.000000|    661.000000|1537.000000| 606.000000|     6.608500|     344700.000000|
|-118.300000|34.260000|         43.000000|1510.000000|    310.000000| 809.000000| 277.000000|     3.599000|     176500.000000|
|-117.810000|33.780000|         27.000000|3589.000000|    507.000000|1484.000000| 495.000000|     5.793400|     270500.000000|
|-118.360000|33.820000|         28.000000|  67.000000|     15.000000|  49.000000|  11.000000|     6.135900|     330000.000000|
|-119.670000|36.330000|         19.000000|1241.000000|    244.000000| 850.000000| 237.000000|     2.937500|    

### ***Defining and cleaning Dataframe***

In [None]:
mydata = spark.read.format("csv").option("header","true").load("original.csv")

In [None]:
mydata

DataFrame[id: string, first_name: string, last_name: string, gender: string, City: string, JobTitle: string, Salary: string, Latitude: string, Longitude: string]

#### show()

In [None]:
mydata.show(10)

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|
|  7|     Masha|    Divers|Female|         Dachun|     

#### When().otherwise()

In [None]:
from pyspark.sql.functions import *
mydata2 = mydata.withColumn("clean_city", when(mydata.City.isNull(),'Unknown').otherwise(mydata.City))
mydata2.show(10)

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52|39.9947462|116.3397725|        Unknown|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|      Mytishchi|
|  6|     Maris|      Folk|Femal

#### Filter()

In [None]:
mydata2 = mydata2.filter(mydata2.JobTitle.isNotNull())
mydata2.show(10)

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|      Mytishchi|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|Kinsealy-Drinan|
|  8|   Goddart|     Flear|  Mal

#### Creating a new column - withColumn()

In [None]:
mydata2 =  mydata2.withColumn("CleanSalary",mydata2.Salary.substr(2,100).cast('float'))
mydata2.show(10)

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|CleanSalary|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|   57438.18|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|    62846.6|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|   61489.23|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489954|      Mytishchi|   63863.09|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Enginee

#### Mean - avg()

In [None]:
mean = mydata2.groupBy().avg("cleanSalary")
mean.show()

+-----------------+
| avg(cleanSalary)|
+-----------------+
|55516.32088199837|
+-----------------+



In [None]:
mean = mydata2.groupBy().avg("cleanSalary").take(1)[0][0]
mean

55516.32088199837

#### Adding a new column showing the literal value of the clean salary

In [None]:
from pyspark.sql.functions import lit
lit(mean)

Column<'55516.32088199837'>

In [None]:
mydata2 = mydata2.withColumn("NewSalary", when(mydata2.CleanSalary.isNull(),lit(mean)).otherwise(mydata2.CleanSalary))
mydata2.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+-----------+----------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|CleanSalary|       NewSalary|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+-----------+----------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|   57438.18|   57438.1796875|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|    62846.6|   62846.6015625|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|   61489.23|  61489.23046875|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.6489

#### Numpy

In [None]:
import numpy as np
latitudes = mydata2.select("Latitude")

In [None]:
latitudes.show(10)

+----------+
|  Latitude|
+----------+
|50.5774075|
|48.8231572|
|44.5047212|
|      null|
|53.4266145|
|45.1905186|
| 32.027934|
|  4.272793|
|     -5.85|
| 39.172378|
+----------+
only showing top 10 rows



In [None]:
latitudes = latitudes.filter(latitudes.Latitude.isNotNull())
latitudes.show(10)

+----------+
|  Latitude|
+----------+
|50.5774075|
|48.8231572|
|44.5047212|
|53.4266145|
|45.1905186|
| 32.027934|
|  4.272793|
|     -5.85|
| 39.172378|
|49.8151822|
+----------+
only showing top 10 rows



In [None]:
latitudes = latitudes.withColumn('latitude2',latitudes.Latitude.cast("float")).select('latitude2')
median = np.median(latitudes.collect())
median

31.93397331237793

In [None]:
mydata2 = mydata2.withColumn("Lat",when(mydata2.Latitude.isNull(),lit(median)).otherwise(mydata2.Latitude))
mydata2.show(10)

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+-----------+----------------+-----------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|CleanSalary|       NewSalary|              Lat|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+-----------+----------------+-----------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|   57438.18|   57438.1796875|       50.5774075|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|    62846.6|   62846.6015625|       48.8231572|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|   61489.23|  61489.23046875|       44.5

### Using SQL Functions

#### GroupBy gender

In [None]:
import pyspark.sql.functions as sqlfunc
genders = mydata2.groupBy("gender").agg(sqlfunc.avg("NewSalary").alias("AvgSalary"))

In [None]:
genders.show()

+------+------------------+
|gender|         AvgSalary|
+------+------------------+
|Female|55677.250125558036|
|  Male| 55361.09385573019|
+------+------------------+



#### Sort City Averge

In [None]:
cityAvg = mydata2.groupBy("City").agg(sqlfunc.avg("NewSalary").alias('avgSalary'))
cityAvg = cityAvg.sort(col("avgSalary").desc())

In [None]:
cityAvg.show(10)

+--------------+-------------+
|          City|    avgSalary|
+--------------+-------------+
|     Mesopotam|  99948.28125|
|    Zhongcheng| 99942.921875|
|        Caxias|99786.3984375|
|   Karangtawar|99638.9921875|
|     Itabaiana|  99502.15625|
|        Pasian|  99421.34375|
|        Webuye| 99368.546875|
|   Yuktae-dong| 99250.828125|
|        Zinder|  99222.84375|
|Timiryazevskiy|   99142.9375|
+--------------+-------------+
only showing top 10 rows



### Types and Functions

In [8]:
from pyspark.sql.types import *
schema = StructType(
    [
      StructField('Name',StringType()),
      StructField('Roll',IntegerType()),
      StructField('Batch',StringType()),
    ]
)

In [9]:
data = [
    ("Vinayak",1,"CSE"),
    ("Neel",2,"CSIT"),
    ("PK",3,"CSE"),
    ("PK",3,"CSE")
]

df = spark.createDataFrame(data = data, schema = schema)
df.show(),df.printSchema()

+-------+----+-----+
|   Name|Roll|Batch|
+-------+----+-----+
|Vinayak|   1|  CSE|
|   Neel|   2| CSIT|
|     PK|   3|  CSE|
|     PK|   3|  CSE|
+-------+----+-----+

root
 |-- Name: string (nullable = true)
 |-- Roll: integer (nullable = true)
 |-- Batch: string (nullable = true)



(None, None)

In [10]:
df.count(),df.dropDuplicates().count()

(4, 3)

In [11]:
df.distinct().show()

+-------+----+-----+
|   Name|Roll|Batch|
+-------+----+-----+
|     PK|   3|  CSE|
|Vinayak|   1|  CSE|
|   Neel|   2| CSIT|
+-------+----+-----+



In [12]:
df.select("Name", "batch").show()

+-------+-----+
|   Name|batch|
+-------+-----+
|Vinayak|  CSE|
|   Neel| CSIT|
|     PK|  CSE|
|     PK|  CSE|
+-------+-----+



### JSON

In [13]:
import json
data = {
    "array": [
      1,
      2,
      3
    ],
    "boolean": True,
    "color": "gold",
    "number": 123,
    "object": {
      "a": "b",
      "c": "d"
    },
    "string": "Hello World"
  }


# data {
#     "fruit": "Apple",
#     "size": "Large",
#     "color": "Red"
# }

json_object = json.dumps(data, indent=4)
with open("sample.json", "w") as outfile:
    outfile.write(json_object)

In [14]:
df = spark.read.json("sample.json")
df.printSchema()
df.show()

root
 |-- _corrupt_record: string (nullable = true)



AnalysisException: Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only include the internal corrupt record column
(named _corrupt_record by default). For example:
spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()
and spark.read.schema(schema).json(file).select("_corrupt_record").show().
Instead, you can cache or save the parsed results and then send the same query.
For example, val df = spark.read.schema(schema).json(file).cache() and then
df.filter($"_corrupt_record".isNotNull).count().

In [15]:
df = spark.read.option('multiline',"true").json("sample.json")

In [16]:
df.show()

+---------+-------+-----+------+------+-----------+
|    array|boolean|color|number|object|     string|
+---------+-------+-----+------+------+-----------+
|[1, 2, 3]|   true| gold|   123|{b, d}|Hello World|
+---------+-------+-----+------+------+-----------+



### Functions

In [17]:
import csv
with open('dummy.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["Country name", "Country Code", "Year", "Value"]

    writer.writerow(field)
    writer.writerow(["India","In01",2000,90])
    writer.writerow(["USA", "US02", 2001,80])
    writer.writerow(["Chine","CH31",2002,70])
    writer.writerow(["Germany", "Ge06", 2003,83])
    writer.writerow(["Nepal","Ne05",2004,40])


In [18]:
df = spark.read.options(delimiter = ',', inferschema = True, header= True ).csv("/content/dummy.csv")
df.show()

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       India|        In01|2000|   90|
|         USA|        US02|2001|   80|
|       Chine|        CH31|2002|   70|
|     Germany|        Ge06|2003|   83|
|       Nepal|        Ne05|2004|   40|
+------------+------------+----+-----+



In [19]:
df.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Value: integer (nullable = true)



In [20]:
df.columns

['Country name', 'Country Code', 'Year', 'Value']

In [21]:
df.count()

5

In [22]:
df.show(2)

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       India|        In01|2000|   90|
|         USA|        US02|2001|   80|
+------------+------------+----+-----+
only showing top 2 rows



In [23]:
df.take(2)

[Row(Country name='India', Country Code='In01', Year=2000, Value=90),
 Row(Country name='USA', Country Code='US02', Year=2001, Value=80)]

In [24]:
import csv
with open('dummy2.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["Country name", "Country Code", "Year", "Value"]

    writer.writerow(field)
    writer.writerow(["India","In01",2000,90])
    writer.writerow(["India","In01",2001,90])
    writer.writerow(["India","In01",2005,90])
    writer.writerow(["USA", "US02", 2001,80])
    writer.writerow(["Chine","CH31",2002,70])
    writer.writerow(["Germany", "Ge06", 2003,83])
    writer.writerow(["Nepal","Ne05",2000,40])


In [25]:
df = spark.read.options(delimiter = ',', inferschema = True, header= True ).csv("/content/dummy2.csv")
df.show()

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       India|        In01|2000|   90|
|       India|        In01|2001|   90|
|       India|        In01|2005|   90|
|         USA|        US02|2001|   80|
|       Chine|        CH31|2002|   70|
|     Germany|        Ge06|2003|   83|
|       Nepal|        Ne05|2000|   40|
+------------+------------+----+-----+



In [26]:
df.dropDuplicates().show()

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       Nepal|        Ne05|2000|   40|
|       India|        In01|2000|   90|
|         USA|        US02|2001|   80|
|       Chine|        CH31|2002|   70|
|     Germany|        Ge06|2003|   83|
|       India|        In01|2005|   90|
|       India|        In01|2001|   90|
+------------+------------+----+-----+



In [27]:
df.show()

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       India|        In01|2000|   90|
|       India|        In01|2001|   90|
|       India|        In01|2005|   90|
|         USA|        US02|2001|   80|
|       Chine|        CH31|2002|   70|
|     Germany|        Ge06|2003|   83|
|       Nepal|        Ne05|2000|   40|
+------------+------------+----+-----+



In [28]:
df.filter(df["Country name"] == "India").show()

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       India|        In01|2000|   90|
|       India|        In01|2001|   90|
|       India|        In01|2005|   90|
+------------+------------+----+-----+



In [29]:
from pyspark.sql.functions import col
df.filter(col("Country name") == "India").show()

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       India|        In01|2000|   90|
|       India|        In01|2001|   90|
|       India|        In01|2005|   90|
+------------+------------+----+-----+



In [30]:
df.filter((df["Country name"] == "India") &  (df["Year"] == 2000) | (df["Year"] == 2005)).show()

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       India|        In01|2000|   90|
|       India|        In01|2005|   90|
+------------+------------+----+-----+



In [31]:
df.show()


+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       India|        In01|2000|   90|
|       India|        In01|2001|   90|
|       India|        In01|2005|   90|
|         USA|        US02|2001|   80|
|       Chine|        CH31|2002|   70|
|     Germany|        Ge06|2003|   83|
|       Nepal|        Ne05|2000|   40|
+------------+------------+----+-----+



In [32]:
df.select(["Country Code" , "Year"]).show()

+------------+----+
|Country Code|Year|
+------------+----+
|        In01|2000|
|        In01|2001|
|        In01|2005|
|        US02|2001|
|        CH31|2002|
|        Ge06|2003|
|        Ne05|2000|
+------------+----+



In [33]:
df.select("Country Code" , "Year").show()

+------------+----+
|Country Code|Year|
+------------+----+
|        In01|2000|
|        In01|2001|
|        In01|2005|
|        US02|2001|
|        CH31|2002|
|        Ge06|2003|
|        Ne05|2000|
+------------+----+



In [34]:
df.select("*").orderBy("year").show()

+------------+------------+----+-----+
|Country name|Country Code|Year|Value|
+------------+------------+----+-----+
|       Nepal|        Ne05|2000|   40|
|       India|        In01|2000|   90|
|       India|        In01|2001|   90|
|         USA|        US02|2001|   80|
|       Chine|        CH31|2002|   70|
|     Germany|        Ge06|2003|   83|
|       India|        In01|2005|   90|
+------------+------------+----+-----+



In [35]:
df.withColumnRenamed("Country name","Country").withColumnRenamed("Country Code","Code").show()

+-------+----+----+-----+
|Country|Code|Year|Value|
+-------+----+----+-----+
|  India|In01|2000|   90|
|  India|In01|2001|   90|
|  India|In01|2005|   90|
|    USA|US02|2001|   80|
|  Chine|CH31|2002|   70|
|Germany|Ge06|2003|   83|
|  Nepal|Ne05|2000|   40|
+-------+----+----+-----+



### RDD

In [36]:
sc = spark.sparkContext

In [38]:
tx = "Once, there was a dog who wandered the streets night and day in search of food. One day, he found a big juicy bone, and he immediately grabbed it in his mouth and took it home. On his way home, he crossed a river and saw another dog with a bone in its mouth. He wanted that bone for himself, too. But as he opened his mouth, the bone he was biting fell into the river and sank. That night, he went home hungry."
rdd1 = sc.parallelize(tx.split())

In [39]:
print(rdd1)

ParallelCollectionRDD[175] at readRDDFromFile at PythonRDD.scala:274


In [42]:
rdd2 = rdd1.map(lambda x: (x,1))
rdd2.collect()

[('Once,', 1),
 ('there', 1),
 ('was', 1),
 ('a', 1),
 ('dog', 1),
 ('who', 1),
 ('wandered', 1),
 ('the', 1),
 ('streets', 1),
 ('night', 1),
 ('and', 1),
 ('day', 1),
 ('in', 1),
 ('search', 1),
 ('of', 1),
 ('food.', 1),
 ('One', 1),
 ('day,', 1),
 ('he', 1),
 ('found', 1),
 ('a', 1),
 ('big', 1),
 ('juicy', 1),
 ('bone,', 1),
 ('and', 1),
 ('he', 1),
 ('immediately', 1),
 ('grabbed', 1),
 ('it', 1),
 ('in', 1),
 ('his', 1),
 ('mouth', 1),
 ('and', 1),
 ('took', 1),
 ('it', 1),
 ('home.', 1),
 ('On', 1),
 ('his', 1),
 ('way', 1),
 ('home,', 1),
 ('he', 1),
 ('crossed', 1),
 ('a', 1),
 ('river', 1),
 ('and', 1),
 ('saw', 1),
 ('another', 1),
 ('dog', 1),
 ('with', 1),
 ('a', 1),
 ('bone', 1),
 ('in', 1),
 ('its', 1),
 ('mouth.', 1),
 ('He', 1),
 ('wanted', 1),
 ('that', 1),
 ('bone', 1),
 ('for', 1),
 ('himself,', 1),
 ('too.', 1),
 ('But', 1),
 ('as', 1),
 ('he', 1),
 ('opened', 1),
 ('his', 1),
 ('mouth,', 1),
 ('the', 1),
 ('bone', 1),
 ('he', 1),
 ('was', 1),
 ('biting', 1),
 ('f