# DATA CLEANING

In [2]:
import pandas as pd
from datetime import timedelta, date
import matplotlib.pyplot as plt
import numpy as np
import math
from numpy import radians as rad
from numpy import sin, cos, sqrt, arcsin

In [3]:
df = pd.read_csv('../Data/AIS_LA_SD_Jan_1_to_15_2016_Filtered_by_Proximity.csv')

In [4]:
#This converts the time metric to a datetime object that can be easily subtracted
df['Timestamp_1'] = pd.to_datetime(df['Timestamp_1'], errors='coerce') #coerce causes NaT for all out-of-bound dates
df['Timestamp_2'] = pd.to_datetime(df['Timestamp_2'], errors='coerce') #coerce causes NaT for all out-of-bound dates
#df['BaseDateTime'] = pd.to_datetime(df['BaseDateTime'])

In [5]:
df.head()

Unnamed: 0,ID,MMSI_1,Timestamp_1,LAT_1,LON_1,SOG_1,COG_1,Heading_1,Vessel Name_1,IMO_1,...,Heading_2,Vessel Name_2,IMO_2,Call Sign_2,Vessel Type_2,Status_2,Length_2,Width_2,Draft_2,Cargo_2
0,271,338115083,2016-01-01 00:00:04,33.60957,-117.88857,0.0,0.0,511,VILLAGE BLACKSMITH,,...,511,BUCKET LIST,,,1019.0,,11.55,4.0,,
1,272,338115083,2016-01-01 00:00:04,33.60957,-117.88857,0.0,0.0,511,VILLAGE BLACKSMITH,,...,511,,,,,,,,,
2,285,367344710,2016-01-01 00:00:16,33.61489,-117.91451,0.1,-167.7,28,SERENGETI,,...,24,KIMBERLY,,V7NN2,1019.0,moored,31.88,7.3,,37.0
3,314,338138623,2016-01-01 00:00:17,34.40718,-119.6917,0.0,100.4,511,GINGER,,...,511,COJO,,,1001.0,,,,,
4,315,338138623,2016-01-01 00:00:17,34.40718,-119.6917,0.0,100.4,511,GINGER,,...,511,CONCEPTION,,WYR8548,1012.0,,22.86,7.6,,


In [6]:
# Connect each row of data with each other to show a close proximity over time between two ships
sorted_df = df.sort_values(by=['MMSI_1', 'MMSI_2', 'Timestamp_1', 'Timestamp_2']).reset_index(drop = True)
#print(sorted_df.iloc[0])
grouped_df = sorted_df.groupby(['MMSI_1','MMSI_2'])['Timestamp_1'].count().reset_index(name="count")
#How many have only one data point for the proximity alert?
print(grouped_df.loc[grouped_df["count"]==1].shape) #potential single instances that we have to make a determination if a COLREG or not

(1517, 3)


In [40]:
#TESTING A INSTANCE OF OIL TANKERS THAT HAVE 552 CASES OF BEING IN CLOSE PROXIMITY TO ONE ANOTHER
out_path = '../Data/oil_tanker_question.xlsx'
subset_tankers = sorted_df.loc[(sorted_df['MMSI_1'] == 636014465) & (sorted_df['MMSI_2'] == 636014804)]
writer = pd.ExcelWriter(out_path)
subset_tankers.to_excel(writer, 'oil_tankers')
writer.save()
# Multiple pings for each ship within one half hour -- reduce to "final location"
#sorted_df.sort_values(by = ['BaseDateTime_Rounded', 'MMSI'])

# Filter out instances that are definitely NOT collision potential situations

In [8]:
#TESTING DISTANCE
R = 6372.8
def haversine(coord1, coord2):
    dLat = radians(coord2[0] - coord1[0])
    dLon = radians(coord2[1]-coord1[1])
    lat1 = radians(coord1[0])
    lat2 = radians(coord2[0])
    a = sin(dLat/2)**2+cos(lat1)*cos(lat2)*sin(dLon/2)**2
    c=2*asin(sqrt(a))
    return R*c

#Find out if the ships fall under these safe circumstances. 
#If so, then we should remove them from potential collision issues
print(sorted_df.shape)

sorted_df['LAT_1_rad'] = rad(sorted_df['LAT_1'])
sorted_df['LAT_2_rad'] = rad(sorted_df['LAT_2'])
sorted_df['LON_1_rad'] = rad(sorted_df['LON_1'])
sorted_df['LON_2_rad'] = rad(sorted_df['LON_2'])
sorted_df['dLAT'] = sorted_df['LAT_2_rad']-sorted_df['LAT_1_rad'] 
sorted_df['dLON'] = sorted_df['LON_2_rad']-sorted_df['LON_1_rad'] 

sorted_df['distance'] = R*2*arcsin(sqrt(sin(sorted_df['dLAT']/2)**2+cos(sorted_df['LAT_1_rad'])*cos(sorted_df['LAT_2_rad'])*sin(sorted_df['dLON']/2)**2))
sorted_df['distance'].head()

(86552, 33)


0    0.233319
1    0.834871
2    0.656948
3    0.594427
4    0.527467
Name: distance, dtype: float64

In [9]:
#When both ships are stationary, we do not have a potential collision so we can remove those from our dataset
sorted_df_moving = sorted_df.loc[(sorted_df['SOG_1'] > 0.1) & (sorted_df['SOG_2']>0.1)]
print(sorted_df_moving.shape)

(10504, 40)


In [10]:
#when both ships are moving in the same direction at the same speed, they are not in danger of colliding
crit1 = abs(sorted_df_moving['COG_1']-sorted_df_moving['COG_2'])>=1
crit2 = abs(sorted_df_moving['SOG_1']-sorted_df_moving['SOG_2'])>=0.1
sorted_df_moving_not_together = sorted_df_moving[crit1 | crit2]
print(sorted_df_moving_not_together.shape)

(9934, 40)


# Now find COLREGs

In [11]:
#calculate the distance between two ships and their new lats and longs
sorted_df_moving_not_together["Y_CORD_1"] = np.cos(sorted_df_moving_not_together['COG_1']) #need to swap x and y coordinates from normal unit circle because of COG set up where the true north
sorted_df_moving_not_together["X_CORD_1"] = np.sin(sorted_df_moving_not_together['COG_1'])
sorted_df_moving_not_together["Y_CORD_2"] = np.cos(sorted_df_moving_not_together['COG_2']) #need to swap x and y coordinates from normal unit circle because of COG set up where the true north
sorted_df_moving_not_together["X_CORD_2"] = np.sin(sorted_df_moving_not_together['COG_2'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [12]:
sorted_df_moving_not_together["Y_CORD_1"].head()

0    0.264931
1    0.976011
2   -0.141984
3    0.543288
4   -0.846212
Name: Y_CORD_1, dtype: float64

In [13]:
sorted_df_moving_not_together['LAT_1_n'] = sorted_df_moving_not_together['LAT_1'] + (180/math.pi)*(sorted_df_moving_not_together["Y_CORD_1"]/R) #Earth Radius
sorted_df_moving_not_together['LON_1_n'] = sorted_df_moving_not_together['LON_1'] + np.divide((180/math.pi)*(sorted_df_moving_not_together["X_CORD_1"]/R), np.cos(sorted_df_moving_not_together['LON_1']*math.pi/180)) 
sorted_df_moving_not_together['LAT_2_n'] = sorted_df_moving_not_together['LAT_2'] + (180/math.pi)*(sorted_df_moving_not_together["Y_CORD_2"]/R) #Earth Radius
sorted_df_moving_not_together['LON_2_n'] = sorted_df_moving_not_together['LON_2'] + np.divide((180/math.pi)*(sorted_df_moving_not_together["X_CORD_2"]/R), np.cos(sorted_df_moving_not_together['LON_2']*math.pi/180)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [16]:
sorted_df_moving_not_together['LAT_1_rad'] = rad(sorted_df_moving_not_together['LAT_1'])
sorted_df_moving_not_together['LON_1_rad'] = rad(sorted_df_moving_not_together['LON_1'])
sorted_df_moving_not_together['LAT_2_rad'] = rad(sorted_df_moving_not_together['LAT_2'])
sorted_df_moving_not_together['LON_2_rad'] = rad(sorted_df_moving_not_together['LON_2'])
sorted_df_moving_not_together['LAT_1_n_rad'] = rad(sorted_df_moving_not_together['LAT_1_n'])
sorted_df_moving_not_together['LON_1_n_rad'] = rad(sorted_df_moving_not_together['LON_1_n'])
sorted_df_moving_not_together['LAT_2_n_rad'] = rad(sorted_df_moving_not_together['LAT_2_n'])
sorted_df_moving_not_together['LON_2_n_rad'] = rad(sorted_df_moving_not_together['LON_2_n'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [17]:
#figure out distance between the two new points of 1 and 2
sorted_df_moving_not_together['dLAT_n'] = sorted_df_moving_not_together['LAT_2_n_rad']-sorted_df_moving_not_together['LAT_1_n_rad'] 
sorted_df_moving_not_together['dLON_n'] = sorted_df_moving_not_together['LON_2_n_rad']-sorted_df_moving_not_together['LON_1_n_rad'] 
sorted_df_moving_not_together['dist_comp_n_1_2'] = R*2*arcsin(sqrt(sin(sorted_df_moving_not_together['dLAT_n']/2)**2+cos(sorted_df_moving_not_together['LAT_1_n_rad'])*cos(sorted_df_moving_not_together['LAT_2_n_rad'])*sin(sorted_df_moving_not_together['dLON_n']/2)**2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [18]:
sorted_df_moving_not_together['dLAT'] = sorted_df_moving_not_together['LAT_2_rad']-sorted_df_moving_not_together['LAT_1_rad'] 
sorted_df_moving_not_together['dLON'] = sorted_df_moving_not_together['LON_2_rad']-sorted_df_moving_not_together['LON_1_rad'] 
sorted_df_moving_not_together['distance'] = R*2*arcsin(sqrt(sin(sorted_df_moving_not_together['dLAT']/2)**2+cos(sorted_df_moving_not_together['LAT_1_rad'])*cos(sorted_df_moving_not_together['LAT_2_rad'])*sin(sorted_df_moving_not_together['dLON']/2)**2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
sorted_df_moving_not_together['dLAT1_n'] = sorted_df_moving_not_together['LAT_1_n_rad']-sorted_df_moving_not_together['LAT_1_rad'] 
sorted_df_moving_not_together['dLON1_n'] = sorted_df_moving_not_together['LON_1_n_rad']-sorted_df_moving_not_together['LON_1_rad'] 
sorted_df_moving_not_together['dist_comp'] = R*2*arcsin(sqrt(sin(sorted_df_moving_not_together['dLAT1_n']/2)**2+cos(sorted_df_moving_not_together['LAT_1_rad'])*cos(sorted_df_moving_not_together['LAT_1_n_rad'])*sin(sorted_df_moving_not_together['dLON1_n']/2)**2))
#THIS SCROLLER IS HARD TO SELECT THINGS SO I ADD AN EXTRA LINE

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
print(sorted_df_moving_not_together['distance'][10:20])
print(sorted_df_moving_not_together['dist_comp_n_1_2'][10:20])
print(sorted_df_moving_not_together['dist_comp'][10:20])


10    0.286085
11    0.726211
12    0.262377
13    0.840877
14    0.851530
15    0.179218
16    0.854552
17    0.873416
18    0.214462
19    0.836470
Name: distance, dtype: float64
10    1.328000
11    2.447797
12    1.482402
13    2.448507
14    2.993372
15    1.357284
16    2.935380
17    0.965500
18    2.325250
19    2.165764
Name: dist_comp_n_1_2, dtype: float64
10    1.250361
11    1.158184
12    1.717650
13    1.806878
14    1.718191
15    1.351348
16    1.720473
17    1.498293
18    1.021732
19    1.000152
Name: dist_comp, dtype: float64


In [21]:
sorted_df_moving_not_together['COLREG'] = 'NO' #initialize

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
grouped_df = sorted_df_moving_not_together.groupby('COLREG')['COLREG'].size().reset_index(name="count")
#How many have only one data point for the proximity alert?
print(grouped_df)

  COLREG  count
0     NO   9934


In [23]:
#when the ships are moving towards one another in their course -- crossing manner
crit = sorted_df_moving_not_together['dist_comp_n_1_2'] <  sorted_df_moving_not_together['distance'] #the new distance
#crossing_df = sorted_df_moving_not_together[crit]
#crossing_df.shape
sorted_df_moving_not_together.loc[crit, 'COLREG'] = 'CROSSING'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [24]:
grouped_df = sorted_df_moving_not_together.groupby('COLREG')['COLREG'].size().reset_index(name="count")
#How many have only one data point for the proximity alert?
print(grouped_df)

     COLREG  count
0  CROSSING    497
1        NO   9437


In [25]:
#now can find head on collisions (transform all crossings into head-on collisions)
CRIT = abs(np.mod(sorted_df_moving_not_together['COG_1'] + 180, 360)-sorted_df_moving_not_together['COG_2'])<=5 #about to hit each other
CRIT2 = sorted_df_moving_not_together['COLREG'] == 'CROSSING'
sorted_df_moving_not_together.loc[CRIT & CRIT2, 'COLREG'] = 'HEAD-ON'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [26]:
grouped_df = sorted_df_moving_not_together.groupby('COLREG')['COLREG'].size().reset_index(name="count")
#How many have only one data point for the proximity alert?
print(grouped_df) #possible crossings are head on collisions therefore they transitioned to head-on category

     COLREG  count
0  CROSSING    492
1   HEAD-ON      5
2        NO   9437


In [34]:
CRIT1 = sorted_df_moving_not_together['dist_comp']>sorted_df_moving_not_together['distance']

sorted_df_moving_not_together['ship1_in_front'] = 'no'
sorted_df_moving_not_together.loc[CRIT1, 'ship1_in_front'] = 'yes'

grouped_df = sorted_df_moving_not_together.groupby('ship1_in_front')['ship1_in_front'].size().reset_index(name="count")
#How many have only one data point for the proximity alert?
print(grouped_df)

  ship1_in_front  count
0            yes   9934


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [35]:
#need to fix the problem where overtake is a parallel path and NOT on the same path
sorted_df_moving_not_together['FRONT_LON'] = 0.0
sorted_df_moving_not_together['FRONT_LAT'] = 0.0
sorted_df_moving_not_together['Y_INT'] = 0.0
sorted_df_moving_not_together['same_path'] = 'no' #the only value that matters is "yes" when I need this column

sorted_df_moving_not_together['SLOPE'] = np.divide(sorted_df_moving_not_together['LAT_1_n'] - sorted_df_moving_not_together['LAT_2_n'], sorted_df_moving_not_together['LON_1_n'] - sorted_df_moving_not_together['LON_2_n'])

crit1 = sorted_df_moving_not_together['ship1_in_front'] == 'yes' #then ship 1 is in front
crit2 = sorted_df_moving_not_together['ship1_in_front'] == 'no' #then ship 2 in is front

sorted_df_moving_not_together.loc[crit1, 'FRONT_LON'] = sorted_df_moving_not_together['LON_1']
sorted_df_moving_not_together.loc[crit2, 'FRONT_LON'] = sorted_df_moving_not_together['LON_2']

sorted_df_moving_not_together.loc[crit1, 'FRONT_LAT'] = sorted_df_moving_not_together['LAT_1']
sorted_df_moving_not_together.loc[crit2, 'FRONT_LAT'] = sorted_df_moving_not_together['LAT_2']

sorted_df_moving_not_together.loc[crit1, 'Y_INT'] = sorted_df_moving_not_together['LAT_2_n'] #oppositive for back lat
sorted_df_moving_not_together.loc[crit2, 'Y_INT'] = sorted_df_moving_not_together['LAT_1_n']

sorted_df_moving_not_together['PROJ_LAT'] = np.multiply(sorted_df_moving_not_together['SLOPE'], sorted_df_moving_not_together['FRONT_LON']) + sorted_df_moving_not_together['Y_INT']
crit = abs(sorted_df_moving_not_together['PROJ_LAT']-sorted_df_moving_not_together['FRONT_LAT'])<=0.1

sorted_df_moving_not_together.loc[crit, 'same_path'] = 'yes'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [36]:
#what ship is in front?
sorted_df_moving_not_together['ship1_in_front'] = 'no' #initialize
CRIT1 = sorted_df_moving_not_together['dist_comp'] > sorted_df_moving_not_together['distance'] #if yes then ship1 is in front, if no then ship2 is in front
sorted_df_moving_not_together.loc[CRIT1, 'ship1_in_front'] = 'yes'

#now find when one is overtaking the other
crit1 = abs(sorted_df_moving_not_together['COG_1']-sorted_df_moving_not_together['COG_2'])<=5 #going in the same direction
crit2a = sorted_df_moving_not_together['ship1_in_front']=='yes' #if ship1 is in front then ship2 could overtake
crit2b = sorted_df_moving_not_together['SOG_2']>sorted_df_moving_not_together['SOG_1'] #so check ship2 speed to see if faster
crit3a = sorted_df_moving_not_together['ship1_in_front']=='no' #else if ship2 is in front then ship1 can overtake
crit3b = sorted_df_moving_not_together['SOG_2']<sorted_df_moving_not_together['SOG_1'] #check that ship1's speed is not faster than ship2
crit4 = sorted_df_moving_not_together['same_path'] == 'yes'
sorted_df_moving_not_together.loc[crit1 & ((crit2a & crit2b) | (crit3a & crit3b)) & crit4, 'COLREG'] = 'OVERTAKE'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [37]:
grouped_df = sorted_df_moving_not_together.groupby('COLREG')['COLREG'].size().reset_index(name="count")
#How many have only one data point for the proximity alert?
print(grouped_df)

     COLREG  count
0  CROSSING    492
1   HEAD-ON      5
2        NO   9435
3  OVERTAKE      2


In [39]:
CRIT = sorted_df_moving_not_together['COLREG']=='OVERTAKE'
head_on_df = sorted_df_moving_not_together[CRIT]
#head_on_df.shape
#print(head_on_df)
head_on_df[['MMSI_1','MMSI_2', 'LAT_1', 'LON_1', 'LAT_2', 'LON_2','COG_1', 'COG_2', 'SOG_1', 'SOG_2', 'ship1_in_front', 'same_path']]

Unnamed: 0,MMSI_1,MMSI_2,LAT_1,LON_1,LAT_2,LON_2,COG_1,COG_2,SOG_1,SOG_2,ship1_in_front,same_path
3256,239998000,538002090,30.77432,-117.31007,30.7745,-117.31103,122.0,123.0,1.1,1.3,yes,yes
85586,636014465,636014804,32.04069,-119.00154,32.03927,-119.00065,116.7,115.5,5.1,5.3,yes,yes


In [31]:
CRIT = sorted_df_moving_not_together['COLREG']=='CROSSING'
head_on_df = sorted_df_moving_not_together[CRIT]
#head_on_df.shape
#print(head_on_df)
head_on_df[['MMSI_1','MMSI_2', 'LAT_1', 'LON_1', 'LAT_2', 'LON_2','COG_1', 'COG_2', 'SOG_1', 'SOG_2']][0:3]

Unnamed: 0,MMSI_1,MMSI_2,LAT_1,LON_1,LAT_2,LON_2,COG_1,COG_2,SOG_1,SOG_2
330,201216315,309933000,31.82658,-116.66772,31.82623,-116.67341,-172.7,91.6,10.1,7.1
334,201216315,309933000,31.83499,-116.64151,31.83063,-116.63458,61.4,74.2,9.9,10.2
402,212871000,239998000,31.30923,-118.19778,31.3095,-118.19777,140.0,140.0,2.0,1.9


In [32]:
CRIT = sorted_df_moving_not_together['COLREG']=='HEAD-ON'
head_on_df = sorted_df_moving_not_together[CRIT]
#head_on_df.shape
#print(head_on_df)
head_on_df[['MMSI_1','MMSI_2', 'LAT_1', 'LON_1', 'LAT_2', 'LON_2','COG_1', 'COG_2', 'SOG_1', 'SOG_2']][0:3]

Unnamed: 0,MMSI_1,MMSI_2,LAT_1,LON_1,LAT_2,LON_2,COG_1,COG_2,SOG_1,SOG_2
53611,338184311,367008510,33.3469,-118.32435,33.34456,-118.32213,-154.6,20.9,0.2,0.2
56858,343961455,538005177,32.34043,-117.09482,32.34246,-117.09623,-143.4,39.0,0.9,0.2
64242,367047150,367645170,46.04823,-118.94959,46.04156,-118.94978,192.1,10.5,7.9,7.9


In [None]:
#save what we have to visualize
out_path = '../Data/OUTPUT.xlsx'
writer = pd.ExcelWriter(out_path)
sorted_df_moving_not_together.to_excel(writer, 'COLREG_all') #changed from COLREG_cross_head_on
writer.save()