# СмолькинаЕВ 6408
# Лабораторная работа №1 по курсу BigData

In [1]:
from pyspark import SparkContext, SparkConf
# Spark application settings for this lab; the master is a local one.
app_name = "Lab1"
conf = SparkConf().setAppName(app_name).setMaster('local[1]')
# getOrCreate reuses an already-running context, so executing this cell a
# second time does not fail because a SparkContext already exists.
sc = SparkContext.getOrCreate(conf=conf)
sc



In [2]:
from typing import NamedTuple
from datetime import datetime

def initStation(stations):
    """Convert raw station rows into typed ``Station`` records.

    Args:
        stations: Iterable of rows; each row is an indexable sequence of
            seven strings in the order id, name, lat, long, dock count,
            landmark, installation date (formatted ``%m/%d/%Y``).

    Yields:
        One ``Station`` named tuple per input row.

    Raises:
        ValueError: If a numeric field or the date cannot be parsed.
        IndexError: If a row has fewer than seven fields.
    """
    class Station(NamedTuple):
        station_id: int
        name: str
        lat: float
        long: float
        dockcount: int
        landmark: str
        # Holds the parsed date, so the annotation is datetime, not str.
        installation: datetime

    for station in stations:
        yield Station(
            station_id=int(station[0]),
            name=station[1],
            lat=float(station[2]),
            long=float(station[3]),
            dockcount=int(station[4]),
            landmark=station[5],
            installation=datetime.strptime(station[6], '%m/%d/%Y'),
        )
        
def initTrip(trips):
    """Turn raw trip rows into typed ``Trip`` records.

    Each row is an indexable sequence of eleven strings in the order of the
    ``Trip`` fields below. An empty duration becomes 0 and an empty date
    becomes ``None``; every other numeric field must be a valid integer.

    Yields:
        One ``Trip`` named tuple per input row.
    """
    class Trip(NamedTuple):
        trip_id: int
        duration: int
        start_date: datetime
        start_station_name: str
        start_station_id: int
        end_date: datetime
        end_station_name: str
        end_station_id: int
        bike_id: int
        subscription_type: str
        zip_code: str

    def parse_timestamp(text):
        # Empty timestamps are kept as None instead of failing to parse.
        if text == '':
            return None
        return datetime.strptime(text, '%m/%d/%Y %H:%M')

    for fields in trips:
        # Positional arguments follow the field order declared in Trip.
        yield Trip(
            int(fields[0]),
            0 if fields[1] == '' else int(fields[1]),
            parse_timestamp(fields[2]),
            fields[3],
            int(fields[4]),
            parse_timestamp(fields[5]),
            fields[6],
            int(fields[7]),
            int(fields[8]),
            fields[9],
            fields[10],
        )

In [3]:
def GetDataFromTable(data):
    """Split a CSV text RDD into its header line and parsed data rows.

    Args:
        data: RDD of text lines whose first element is the header line.

    Returns:
        A ``(columns, table)`` pair. ``columns`` is the raw header line;
        ``table`` is an RDD holding one list of field strings for every line
        that differs from the header.
    """
    import csv  # local import: keeps this notebook cell self-contained

    columns = data.first()

    def parse_line(line):
        # csv.reader honours quoted fields, so a comma inside quotes does
        # not split a value the way a plain str.split(",") would.
        return next(csv.reader([line]))

    table = data.filter(lambda row: row != columns).map(parse_line)
    return columns, table


In [4]:
tripData = sc.textFile("trip.csv")
stationData = sc.textFile("station.csv")

# Preview each file: a title followed by its first two lines
# (the header and the first record).
for title, rdd in (("Trip data\n", tripData), ("\n\nStation data\n", stationData)):
    print(title)
    for data in rdd.take(2):
        print("\n", data)

Trip data


 id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code

 4576,63,8/29/2013 14:13,South Van Ness at Market,66,8/29/2013 14:14,South Van Ness at Market,66,520,Subscriber,94127


Station data


 id,name,lat,long,dock_count,city,installation_date

 2,San Jose Diridon Caltrain Station,37.329732,-121.90178200000001,27,San Jose,8/6/2013


In [5]:
# Separate each file's header line from its parsed data rows.
tripColumns, trips = GetDataFromTable(tripData)
stationColumns, stations = GetDataFromTable(stationData)
# The name used to be misspelled; keep the old spelling as an alias in case
# other cells still refer to it.
stationCoolumns = stationColumns
stations.first()

['2',
 'San Jose Diridon Caltrain Station',
 '37.329732',
 '-121.90178200000001',
 '27',
 'San Jose',
 '8/6/2013']

In [6]:
!head station.csv

id,name,lat,long,dock_count,city,installation_date
2,San Jose Diridon Caltrain Station,37.329732,-121.90178200000001,27,San Jose,8/6/2013
3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
6,San Pedro Square,37.336721000000004,-121.894074,15,San Jose,8/7/2013
7,Paseo de San Antonio,37.333798,-121.88694299999999,15,San Jose,8/7/2013
8,San Salvador at 1st,37.330165,-121.88583100000001,15,San Jose,8/5/2013
9,Japantown,37.348742,-121.89471499999999,15,San Jose,8/5/2013
10,San Jose City Hall,37.337391,-121.886995,15,San Jose,8/6/2013


# 1 Найти велосипед с максимальным временем пробега.

In [7]:
# Build typed Trip records from the parsed rows, one partition at a time.
tripsObjects = trips.mapPartitions(initTrip)

In [8]:
# Sum the trip durations per bike, then take the bike with the largest total.
durationByBike = tripsObjects.map(lambda trip: (trip.bike_id, trip.duration))
totalDurationByBike = durationByBike.reduceByKey(lambda left, right: left + right)
longestRunning = totalDurationByBike.top(1, key=lambda pair: pair[1])
bike_top = longestRunning[0][0]
print("Id велосипеда с максимальным временем пробега: ", bike_top)

Id велосипеда с максимальным временем пробега:  535


# 2 Найти наибольшее геодезическое расстояние между станциями.

In [9]:
import math
def degreesToRadians(degrees):
    """Convert an angle from degrees to radians."""
    radians_per_degree = math.pi / 180
    return radians_per_degree * degrees

def distanceInKmBetweenEarthCoordinates(lat1, lon1, lat2, lon2):
    """Return the haversine distance in kilometres between two points.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance along a sphere of radius 6371 km, as a float.
    """
    earthRadiusKm = 6371

    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)

    lat1 = math.radians(lat1)
    lat2 = math.radians(lat2)

    sinHalfLat = math.sin(dLat / 2)
    sinHalfLon = math.sin(dLon / 2)
    a = sinHalfLat * sinHalfLat + sinHalfLon * sinHalfLon * math.cos(lat1) * math.cos(lat2)
    # Rounding can push `a` marginally outside [0, 1] for near-antipodal
    # points, which would make sqrt(1 - a) raise; clamp it first.
    a = min(1.0, max(0.0, a))
    angle = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return earthRadiusKm * angle

In [10]:
# Build typed Station records from the parsed rows, one partition at a time.
stationObjects = stations.mapPartitions(initStation)

In [11]:
# Give every station the same key so that a self-join yields all station pairs.
all_names = stationObjects.map(lambda row: (1, (row.station_id, row.name, row.lat, row.long)))
# Drop the join key, keep each unordered pair once (lower id first, which
# also removes self-pairs), compute the distance for every pair and take the
# largest one.
answer = all_names.join(all_names)\
                  .map(lambda row: row[1])\
                  .filter(lambda row: row[0][0] < row[1][0])\
                  .map(lambda row: ((row[0][1], row[1][1]), distanceInKmBetweenEarthCoordinates(row[0][2], row[0][3], row[1][2], row[1][3])))\
                  .sortBy(lambda x: x[1], ascending=False)\
                  .first()
print(answer)
# The second station name previously lacked its closing quote in the message.
print("Самое большое расстояние между станциями '%s' и '%s' и равно %f км" % (answer[0][0], answer[0][1], answer[1]))

(('SJSU - San Salvador at 9th', 'Embarcadero at Sansome'), 69.9208759542826)
Самое большое расстояние между станциями 'SJSU - San Salvador at 9th' и 'Embarcadero at Sansome и равно 69.920876 км


# 3 Найти путь велосипеда с максимальным временем пробега через станции.

In [12]:
# Trips of the top bike ordered by start time, reduced to
# (start station name, end station name) pairs.
allTheWay = tripsObjects.filter(lambda trip: trip.bike_id == bike_top)\
                        .sortBy(lambda trip: trip.start_date)\
                        .map(lambda trip: (trip.start_station_name, trip.end_station_name))

# Collect once and reuse the list: calling count() and then collect() would
# evaluate the filter/sort pipeline twice.
route = allTheWay.collect()
print("Всего поездок: ", len(route))
print("\n\n")
for trip in route:
    print(trip[0], " -> ", trip[1])

Всего поездок:  1328



Post at Kearney  ->  San Francisco Caltrain (Townsend at 4th)
San Francisco Caltrain (Townsend at 4th)  ->  San Francisco Caltrain 2 (330 Townsend)
San Francisco Caltrain 2 (330 Townsend)  ->  Market at Sansome
Market at Sansome  ->  2nd at South Park
2nd at Townsend  ->  Davis at Jackson
San Francisco City Hall  ->  Civic Center BART (7th at Market)
Civic Center BART (7th at Market)  ->  Post at Kearney
Post at Kearney  ->  Embarcadero at Sansome
Embarcadero at Sansome  ->  Washington at Kearney
Washington at Kearney  ->  Market at Sansome
Market at Sansome  ->  Market at Sansome
Market at Sansome  ->  2nd at Folsom
2nd at Folsom  ->  2nd at Townsend
Temporary Transbay Terminal (Howard at Beale)  ->  2nd at Townsend
2nd at Townsend  ->  Embarcadero at Sansome
Embarcadero at Sansome  ->  Clay at Battery
Clay at Battery  ->  Harry Bridges Plaza (Ferry Building)
Harry Bridges Plaza (Ferry Building)  ->  Clay at Battery
Clay at Battery  ->  San Francisco Caltrain (

Spear at Folsom  ->  Broadway St at Battery St
Broadway St at Battery St  ->  Broadway St at Battery St
Broadway St at Battery St  ->  Steuart at Market
San Francisco Caltrain (Townsend at 4th)  ->  Townsend at 7th
Townsend at 7th  ->  San Francisco Caltrain (Townsend at 4th)
San Francisco Caltrain (Townsend at 4th)  ->  5th at Howard
5th at Howard  ->  San Francisco Caltrain (Townsend at 4th)
San Francisco Caltrain (Townsend at 4th)  ->  Steuart at Market
Steuart at Market  ->  Clay at Battery
Clay at Battery  ->  Embarcadero at Folsom
Embarcadero at Folsom  ->  Embarcadero at Sansome
Embarcadero at Sansome  ->  Harry Bridges Plaza (Ferry Building)
Harry Bridges Plaza (Ferry Building)  ->  2nd at Townsend
2nd at Townsend  ->  Beale at Market
Beale at Market  ->  Broadway St at Battery St
Broadway St at Battery St  ->  Clay at Battery
Clay at Battery  ->  San Francisco Caltrain (Townsend at 4th)
2nd at South Park  ->  2nd at South Park
2nd at South Park  ->  San Francisco Caltrain (Tow

2nd at Folsom  ->  San Francisco Caltrain (Townsend at 4th)
5th at Howard  ->  San Francisco Caltrain (Townsend at 4th)
San Francisco Caltrain (Townsend at 4th)  ->  Harry Bridges Plaza (Ferry Building)
Harry Bridges Plaza (Ferry Building)  ->  2nd at Townsend
2nd at Townsend  ->  San Francisco Caltrain (Townsend at 4th)
San Francisco Caltrain (Townsend at 4th)  ->  Post at Kearny
Post at Kearny  ->  San Francisco Caltrain (Townsend at 4th)
San Francisco Caltrain (Townsend at 4th)  ->  Powell Street BART
Powell Street BART  ->  Powell Street BART
Powell Street BART  ->  Embarcadero at Sansome
Temporary Transbay Terminal (Howard at Beale)  ->  Broadway St at Battery St
Broadway St at Battery St  ->  Mechanics Plaza (Market at Battery)
Embarcadero at Folsom  ->  San Francisco Caltrain (Townsend at 4th)
San Francisco Caltrain (Townsend at 4th)  ->  South Van Ness at Market
South Van Ness at Market  ->  5th at Howard
5th at Howard  ->  Townsend at 7th
Townsend at 7th  ->  Post at Kearny
Po

# 4 Найти количество велосипедов в системе.

In [13]:
# Every distinct bike id that appears in the trip records counts as one bike.
bikeIds = tripsObjects.map(lambda trip: trip.bike_id)
print("Количество велосипедов:", bikeIds.distinct().count())

Количество велосипедов: 700


# 5 Найти пользователей, потративших на поездки более 3 часов.

In [14]:
# Three hours expressed in the same unit as trip.duration.
# assumes duration is stored in seconds — TODO confirm
threeHours = 60 * 60 * 3

# NOTE(review): this selects individual trips longer than three hours and
# reports their bike ids (duplicates included); the task heading asks about
# users — confirm which key is intended.
longTrips = tripsObjects.filter(lambda trip: threeHours < trip.duration)
query = longTrips.map(lambda trip: trip.bike_id)
print("Количество: ", query.count())
print(query.collect())

Количество:  8322
[433, 377, 645, 434, 501, 614, 464, 150, 460, 390, 269, 141, 484, 458, 638, 640, 627, 552, 608, 439, 416, 95, 587, 257, 353, 412, 608, 594, 388, 572, 613, 580, 385, 559, 320, 391, 428, 387, 541, 471, 396, 277, 52, 350, 381, 348, 488, 536, 410, 537, 342, 583, 259, 369, 406, 427, 142, 259, 416, 468, 339, 511, 272, 572, 568, 467, 487, 260, 325, 502, 94, 279, 524, 408, 436, 317, 544, 333, 614, 345, 401, 625, 437, 367, 332, 426, 515, 581, 446, 444, 585, 616, 338, 277, 320, 385, 576, 606, 12, 528, 484, 421, 540, 517, 30, 409, 372, 394, 466, 473, 568, 572, 592, 553, 472, 493, 612, 520, 501, 321, 586, 512, 555, 487, 412, 386, 467, 524, 521, 630, 370, 205, 217, 415, 326, 361, 567, 316, 464, 549, 547, 610, 427, 498, 501, 523, 572, 100, 160, 669, 612, 511, 433, 630, 374, 572, 617, 507, 272, 365, 484, 447, 575, 432, 611, 388, 466, 440, 505, 421, 279, 334, 547, 486, 613, 568, 510, 275, 552, 540, 556, 548, 359, 387, 466, 388, 441, 622, 360, 586, 481, 485, 103, 608, 413, 582, 24, 44